In [1]:
import logging
import tensorflow as tf
import mlflow
import random
import pandas as pd
import numpy as np

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

sys.argv.clear()
sys.argv.append("")

In [2]:
from src import config
from src.features import preprocessing,sequences,knowledge
from src.training import models
from src import refinement

# four types of knowledge for Huawei:
# gram, text, causal, log template(gram_logs)
experiment_config = config.ExperimentConfig()
experiment_config.model_type = "gram"
experiment_config.sequence_type = "huawei_logs"

model_config = models.config.ModelConfig()
model_config.rnn_type: str = "gru"

# keep all other default configurations 
huawei_preprocessor_config = preprocessing.huawei.HuaweiPreprocessorConfig()
# for template_hierarchy
huawei_preprocessor_config.use_log_hierarchy = True

sequence_config = sequences.config.SequenceConfig()
knowledge_config = knowledge.config.KnowledgeConfig()
refinement_config = refinement.config.RefinementConfig()

In [3]:
def log_all_configs_to_mlflow():
    for config in [
        experiment_config,
        huawei_preprocessor_config,
        sequence_config,
        model_config,
        knowledge_config,
        refinement_config,
    ]:
        for config_name, config_value in vars(config).items():
            full_config_name = config.__class__.__name__ + config_name
            mlflow.log_param(full_config_name, str(config_value))

In [4]:
mlflow.set_experiment("Domain Guided Monitoring")
with mlflow.start_run() as run:
    run_id = run.info.run_id
logging.info("Starting run %s", run_id)
tf.random.set_seed(experiment_config.tensorflow_seed)
random.seed(experiment_config.random_seed)

## Load Huawei sequences

In [5]:
sequence_preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(
    huawei_preprocessor_config,
)
sequence_column_name = sequence_preprocessor.sequence_column_name
sequence_df = sequence_preprocessor.load_data()

transformer = sequences.transformer.NextPartialSequenceTransformerFromDataframe(sequence_config)
metadata = transformer.collect_metadata(sequence_df, sequence_column_name)

  sequence_df = sequence_preprocessor.load_data()
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:21<00:00, 6988.70it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:15<00:00, 9689.68it/s]
Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 14243.76it/s]


## Generate dataset

In [6]:
sequence_df_pkl_file: str = "data/sequences_df.pkl"
sequence_df.to_pickle(sequence_df_pkl_file)

train_sequences, test_sequences = transformer._split_train_test(sequence_df, sequence_column_name)

def generate(for_train):
    relevant_sequences = train_sequences if for_train else test_sequences
    for sequence in relevant_sequences:
        split_sequences = transformer._split_sequence(sequence)
        for split_sequence in split_sequences:
            transformer._translate_and_pad(split_sequence, metadata)
            yield split_sequence.x_vecs_stacked, split_sequence.y_vec

def generate_train():
    return generate(for_train=True)

def generate_test():
    return generate(for_train=False)

In [7]:
train_dataset = (
    tf.data.Dataset.from_generator(
        generate_train,
        output_types=(tf.float32, tf.float32),
    )
    .shuffle(
        experiment_config.dataset_shuffle_buffer,
        seed=experiment_config.dataset_shuffle_seed,
        reshuffle_each_iteration=True,
    )
    .batch(experiment_config.batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

test_dataset = (
    tf.data.Dataset.from_generator(
        generate_test,
        output_types=(tf.float32, tf.float32),
    )
    .batch(experiment_config.batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

2022-05-30 07:58:18.716013: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-30 07:58:18.716031: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-30 07:58:18.716043: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hj-ubuntu): /proc/driver/nvidia/version does not exist
2022-05-30 07:58:18.716527: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-05-30 07:58:18.740051: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2599990000 Hz
2022-05-30 07:58:18.740712: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe178000b60 initialized for platform Host (this does not guarantee that 

## Generate log template

In [8]:
hierarchy_preprocessor = preprocessing.ConcurrentAggregatedLogsHierarchyPreprocessor(
    config = huawei_preprocessor_config
)

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None) 
pd.set_option('max_colwidth',100)

### Step 1 Load log-only data 

In [14]:
huawei_df = sequence_preprocessor._load_log_only_data()

  exec(code_obj, self.user_global_ns, self.user_ns)
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:20<00:00, 7388.87it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:16<00:00, 9440.00it/s]
Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 14264.19it/s]


In [15]:
huawei_df.head(100)

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,Payload,http_url,fine_log_cluster_template,fine_log_cluster_path,coarse_log_cluster_template,coarse_log_cluster_path,timestamp,url_cluster_template,url_cluster_path
0,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.683000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.683000+01:00,,
1,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.877000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.877000+01:00,,
2,wally113,WARNING,keystone,keystone.server.flask.application,,,2019-11-25T15:01:52.744000000+01:00,Could not find domain: Default.: DomainNotFound: Could not find domain: Default.,,could not find domain default domainnotfound could not find domain default,11 could not find domain default domainnotfound could not find,could not find domain default domainnotfound could not find domain default,11 could not find,2019-11-25 15:01:52.744000+01:00,,
3,wally113,INFO,glance-api,eventlet.wsgi.server,,,2019-11-25T15:01:48.510000000+01:00,"172.17.0.2 - - [25/Nov/2019 15:01:48] ""GET /v2/schemas/image HTTP/1.1"" 200 5562 0.002795",,25 nov 2019 * 01 * get v2 schemas image http 11 200 5562 *,15 * nov * * * * get v2 schemas,25 nov 2019 * * * * * * * * * * * *,15 * nov *,2019-11-25 15:01:48.510000+01:00,,
4,wally113,INFO,nova-api,nova.api.openstack,,,2019-11-25T15:01:56.520000000+01:00,http://130.149.249.123:8774/v2.1/dbf4ab7d6e84449e93e02f305534200b/servers returned with HTTP 500,,http v21 * servers returned with http 500,8 http v21 * servers returned with http *,http v21 * servers returned with http 500,8 http v21 *,2019-11-25 15:01:56.520000+01:00,,
5,wally113,ERROR,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:56.964000000+01:00,Unexpected exception in API method: HTTPInternalServerError: HTTP HTTPInternalServerError\n2019-...,,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in api method httpinternalservererror http httpinternalservererror *,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in,2019-11-25 15:01:56.964000+01:00,,
6,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.227000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.227000+01:00,,
7,wally113,INFO,nova-api,nova.osapi_compute.wsgi.server,,,2019-11-25T15:01:57.228000000+01:00,"172.17.0.2 ""POST /v2.1/dbf4ab7d6e84449e93e02f305534200b/servers HTTP/1.1"" status: 500 len: 647 t...",,post v21 * servers http 11 status 500 len * time *,12 post v21 * servers http * status * len,post v21 * servers http 11 status * len * time *,12 post v21 *,2019-11-25 15:01:57.228000+01:00,,
8,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-25T15:01:49.842000000+01:00,"172.17.0.2 ""DELETE /v2.0/security-groups/78c81907-a991-4109-91df-57258ed05d42 HTTP/1.1"" status: ...",,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups * http * status * len,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups,2019-11-25 15:01:49.842000+01:00,,
9,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.882000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.882000+01:00,,


### Sep 2 Load log-only hierarchy (different from attribute-only hierarchy)

In [16]:
relevant_log_columns = set(
    [x for x in sequence_preprocessor.relevant_columns if "log_cluster_template" in x]
    + ["coarse_log_cluster_path"]
)

In [17]:
attribute_hierarchy = hierarchy_preprocessor._load_attribute_hierarchy(
    huawei_df, set(["coarse_log_cluster_path"])
)

Loading hierarchy for column coarse_log_cluster_path: 100%|███████| 469/469 [00:04<00:00, 113.62it/s]


In [18]:
attribute_hierarchy

Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,coarse_log_cluster_path,root,coarse_log_cluster_path
1,coarse_log_cluster_path,coarse_log_cluster_path->38,coarse_log_cluster_path,38
2,coarse_log_cluster_path->38,coarse_log_cluster_path->38->*,38,*
3,coarse_log_cluster_path->38->*,coarse_log_cluster_path->38->*->nov,*,nov
4,coarse_log_cluster_path->38->*->nov,coarse_log_cluster_path->38->*->nov->*,nov,*
5,coarse_log_cluster_path->38->*->nov->*,coarse_log_cluster_path#38 * nov *,*,coarse_log_cluster_path#38 * nov *
6,coarse_log_cluster_path,coarse_log_cluster_path->18,coarse_log_cluster_path,18
7,coarse_log_cluster_path->18,coarse_log_cluster_path->18->running,18,running
8,coarse_log_cluster_path->18->running,coarse_log_cluster_path->18->running->novaosapi,running,novaosapi
9,coarse_log_cluster_path->18->running->novaosapi,coarse_log_cluster_path->18->running->novaosapi->compute,novaosapi,compute


In [21]:
attribute_hierarchy.append(
    hierarchy_preprocessor._load_log_hierarchy(huawei_df, relevant_log_columns),
    ignore_index=True,
).drop_duplicates().reset_index(drop=True)

Adding huawei log hierarchy: 100%|████████████████████████| 169230/169230 [00:13<00:00, 12430.85it/s]


Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,coarse_log_cluster_path,root,coarse_log_cluster_path
1,coarse_log_cluster_path,coarse_log_cluster_path->38,coarse_log_cluster_path,38
2,coarse_log_cluster_path->38,coarse_log_cluster_path->38->*,38,*
3,coarse_log_cluster_path->38->*,coarse_log_cluster_path->38->*->nov,*,nov
4,coarse_log_cluster_path->38->*->nov,coarse_log_cluster_path->38->*->nov->*,nov,*
5,coarse_log_cluster_path->38->*->nov->*,coarse_log_cluster_path#38 * nov *,*,coarse_log_cluster_path#38 * nov *
6,coarse_log_cluster_path,coarse_log_cluster_path->18,coarse_log_cluster_path,18
7,coarse_log_cluster_path->18,coarse_log_cluster_path->18->running,18,running
8,coarse_log_cluster_path->18->running,coarse_log_cluster_path->18->running->novaosapi,running,novaosapi
9,coarse_log_cluster_path->18->running->novaosapi,coarse_log_cluster_path->18->running->novaosapi->compute,novaosapi,compute


In [22]:
attribute_hierarchy

Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,coarse_log_cluster_path,root,coarse_log_cluster_path
1,coarse_log_cluster_path,coarse_log_cluster_path->38,coarse_log_cluster_path,38
2,coarse_log_cluster_path->38,coarse_log_cluster_path->38->*,38,*
3,coarse_log_cluster_path->38->*,coarse_log_cluster_path->38->*->nov,*,nov
4,coarse_log_cluster_path->38->*->nov,coarse_log_cluster_path->38->*->nov->*,nov,*
5,coarse_log_cluster_path->38->*->nov->*,coarse_log_cluster_path#38 * nov *,*,coarse_log_cluster_path#38 * nov *
6,coarse_log_cluster_path,coarse_log_cluster_path->18,coarse_log_cluster_path,18
7,coarse_log_cluster_path->18,coarse_log_cluster_path->18->running,18,running
8,coarse_log_cluster_path->18->running,coarse_log_cluster_path->18->running->novaosapi,running,novaosapi
9,coarse_log_cluster_path->18->running->novaosapi,coarse_log_cluster_path->18->running->novaosapi->compute,novaosapi,compute


### Step 3 Build hierarchy from daraframe

In [23]:
hierarchy = knowledge.HierarchyKnowledge(
    config=knowledge_config,
)
hierarchy.build_hierarchy_from_df(attribute_hierarchy, metadata.x_vocab)

Building Hierarchy from df: 2340it [00:00, 17133.68it/s]


## Genarate model

In [None]:
# load model
model = models.GramModel()

# build model
model.build(metadata, hierarchy)

## Run Experiment

In [None]:
# train model
model.train_dataset(
    train_dataset,
    test_dataset,
    experiment_config.multilabel_classification,
    experiment_config.n_epochs,
)

In [None]:
# logging dataset info
mlflow.log_metric("train_size", len([x for x in train_dataset]))
mlflow.log_metric("test_size", len([x for x in test_dataset]))
mlflow.log_metric("x_vocab_size", len(metadata.x_vocab))
mlflow.log_metric("y_vocab_size", len(metadata.y_vocab))

# generate artifacts
# skip

# set mlflow tags
mlflow.set_tag("sequence_type", experiment_config.sequence_type)
mlflow.set_tag("model_type", experiment_config.model_type)
if len(metadata.y_vocab) == 1:
    mlflow.set_tag("task_type", "risk_prediction")
else:
    mlflow.set_tag("task_type", "sequence_prediction")

logging.info("Finished run %s", run_id)