In [1]:
import logging
import tensorflow as tf
import mlflow
import random
import pandas as pd
import numpy as np
import tqdm

import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package used by the following cells can be found.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Replace the kernel's command-line arguments with a single empty entry,
# mutating the existing list in place.
# NOTE(review): presumably the project's config classes parse sys.argv — confirm.
sys.argv[:] = [""]

In [2]:
from src import config
from src.features import preprocessing,sequences,knowledge
from src.training import models
from src import refinement

# Four types of knowledge are available for the Huawei data:
# gram, text, causal, log template (gram_logs).
# This notebook configures the "text" variant on the Huawei log sequences.
experiment_config = config.ExperimentConfig()
experiment_config.model_type = "text"
experiment_config.sequence_type = "huawei_logs"

model_config = models.config.ModelConfig()
# Plain assignment on purpose: a type annotation on an attribute of another
# object is evaluated but never stored, and mypy rejects it.
model_config.rnn_type = "gru"

# Keep the default values for all other configuration objects.
huawei_preprocessor_config = preprocessing.huawei.HuaweiPreprocessorConfig()
sequence_config = sequences.config.SequenceConfig()
knowledge_config = knowledge.config.KnowledgeConfig()
refinement_config = refinement.config.RefinementConfig()

In [3]:
def log_all_configs_to_mlflow():
    """Log every attribute of each notebook-level config object as an MLflow parameter.

    The parameter key is the config's class name immediately followed by the
    attribute name (no separator); the value is ``str()`` of the attribute.
    Reads the six module-level config objects and calls ``mlflow.log_param``
    once per attribute.
    """
    all_configs = (
        experiment_config,
        huawei_preprocessor_config,
        sequence_config,
        model_config,
        knowledge_config,
        refinement_config,
    )
    # `cfg` rather than `config`, so the imported `config` module is not shadowed.
    for cfg in all_configs:
        key_prefix = cfg.__class__.__name__
        for attribute_name, attribute_value in vars(cfg).items():
            mlflow.log_param(key_prefix + attribute_name, str(attribute_value))

In [4]:
mlflow.set_experiment("Domain Guided Monitoring")

# Keep the run open for the rest of the notebook. Wrapping only the run_id
# lookup in `with mlflow.start_run()` ends the run as soon as the block exits,
# so the training and metric logging in later cells would end up in a
# different, implicitly created run.
if mlflow.active_run() is not None:
    # A run left open by an earlier execution of this cell would make
    # start_run() raise; close it first so the cell can be re-run.
    mlflow.end_run()
run = mlflow.start_run()
run_id = run.info.run_id
logging.info("Starting run %s", run_id)

# Record the configuration objects as parameters of this run.
log_all_configs_to_mlflow()

# Seed TensorFlow and Python's `random` module from the experiment configuration.
tf.random.set_seed(experiment_config.tensorflow_seed)
random.seed(experiment_config.random_seed)

## Load Huawei sequences

In [5]:
# Build the Huawei log preprocessor from the default preprocessor configuration.
sequence_preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(
    huawei_preprocessor_config,
)
# Column name passed to metadata collection and to the train/test split below.
# NOTE(review): presumably the dataframe column that holds the sequences — confirm.
sequence_column_name = sequence_preprocessor.sequence_column_name
# Slow step: the recorded output shows three DRAIN clustering passes over the logs.
sequence_df = sequence_preprocessor.load_data()

# The transformer later splits sequences and turns them into (x, y) samples;
# `metadata` exposes x_vocab / y_vocab, which later cells read.
transformer = sequences.transformer.NextPartialSequenceTransformerFromDataframe(sequence_config)
metadata = transformer.collect_metadata(sequence_df, sequence_column_name)

  sequence_df = sequence_preprocessor.load_data()
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:20<00:00, 7258.50it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:15<00:00, 9600.14it/s]
Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 14211.82it/s]


## Generate dataset

In [6]:
# Persist the sequence dataframe; the path is relative to the current working directory.
# NOTE(review): the pickle is not read again in the visible part of this notebook —
# confirm whether it is still needed.
sequence_df_pkl_file: str = "data/sequences_df.pkl"
sequence_df.to_pickle(sequence_df_pkl_file)

# Split into train and test sequences using the transformer's private helper.
train_sequences, test_sequences = transformer._split_train_test(sequence_df, sequence_column_name)

def generate(for_train):
    """Yield ``(x_vecs_stacked, y_vec)`` pairs for the train or the test split.

    Args:
        for_train: truthy to iterate ``train_sequences``, falsy for ``test_sequences``.

    Each sequence is split with ``transformer._split_sequence``; every part is
    passed through ``transformer._translate_and_pad`` (which fills the part in
    place) before its vectors are yielded.
    """
    chosen_split = train_sequences if for_train else test_sequences
    for full_sequence in chosen_split:
        for part in transformer._split_sequence(full_sequence):
            transformer._translate_and_pad(part, metadata)
            yield part.x_vecs_stacked, part.y_vec

def generate_train():
    """Return a fresh generator over the training samples.

    Zero-argument wrapper, so it can be handed to
    ``tf.data.Dataset.from_generator`` as the generator callable.
    """
    train_samples = generate(for_train=True)
    return train_samples

def generate_test():
    """Return a fresh generator over the test samples.

    Zero-argument wrapper, so it can be handed to
    ``tf.data.Dataset.from_generator`` as the generator callable.
    """
    test_samples = generate(for_train=False)
    return test_samples

In [7]:
# Training pipeline: stream samples from the generator, shuffle, batch, prefetch.
train_dataset = (
    tf.data.Dataset.from_generator(
        generate_train,
        # both the x and the y tensors are declared as float32
        output_types=(tf.float32, tf.float32),
    )
    .shuffle(
        experiment_config.dataset_shuffle_buffer,
        seed=experiment_config.dataset_shuffle_seed,
        # draw a new shuffle order on every pass over the dataset
        reshuffle_each_iteration=True,
    )
    .batch(experiment_config.batch_size)
    # let tf.data pick the prefetch buffer size
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: same generator-backed source and batching as the training
# pipeline, but without shuffling.
test_dataset = (
    tf.data.Dataset.from_generator(
        generate_test,
        # both the x and the y tensors are declared as float32
        output_types=(tf.float32, tf.float32),
    )
    .batch(experiment_config.batch_size)
    # let tf.data pick the prefetch buffer size
    .prefetch(tf.data.experimental.AUTOTUNE)
)

2022-05-30 09:05:35.408383: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-30 09:05:35.408404: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-30 09:05:35.408419: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hj-ubuntu): /proc/driver/nvidia/version does not exist
2022-05-30 09:05:35.408622: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-05-30 09:05:35.436155: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2599990000 Hz
2022-05-30 09:05:35.437882: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7efe74000b60 initialized for platform Host (this does not guarantee that 

## Generate text

In [9]:
# Preprocessor used below to derive textual descriptions; it is built from the
# same configuration object as the sequence preprocessor.
description_preprocessor = preprocessing.ConcurrentAggregatedLogsDescriptionPreprocessor(
    config=huawei_preprocessor_config,
)

# Display dataframes without truncating rows or columns, and cap the width of
# a single cell's text at 100 characters.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("max_colwidth", 100)

### Step 1 Load data 

In [10]:
# Load the log-only data via the sequence preprocessor's private helper.
# The recorded output shows the DRAIN clustering passes running again here.
# NOTE(review): this looks like it repeats work already done by load_data() — confirm.
huawei_df = sequence_preprocessor._load_log_only_data()

  exec(code_obj, self.user_global_ns, self.user_ns)
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:20<00:00, 7340.19it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:16<00:00, 9400.83it/s]
Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 14921.86it/s]


In [11]:
# Reference listing, kept as comments rather than as a bare string literal
# (a string statement is an executed no-op, and the "\-" / "\*" sequences in
# the regex below are invalid escapes in a non-raw string).
# NOTE(review): presumably the implementation of the
# `_load_column_descriptions` method called below — confirm against src.
#
# def _load_column_descriptions(
#     self, huawei_df: pd.DataFrame, relevant_columns: Set[str]
# ) -> pd.DataFrame:
#     http_descriptions = self._load_http_descriptions()
#     column_descriptions = self._get_column_descriptions()
#     description_records = []
#     for column in relevant_columns:
#         values = set(
#             huawei_df[column].dropna().astype(str).replace(np.nan, "", regex=True)
#         )
#         values = set([str(x).lower() for x in values if len(str(x)) > 0])
#         for value in tqdm(values, desc="Loading descriptions for column " + column):
#             description = ""
#             if column == "Hostname":
#                 name = value.rstrip("0123456789")
#                 number = value[len(name) :]
#                 description = name + " " + number
#             elif column == "http_status":
#                 description = http_descriptions[value]
#             else:
#                 description = " ".join(re.split("[,._\-\*]+", value))
#
#             if column in column_descriptions:
#                 description = column_descriptions[column] + " " + description
#
#             description_records.append(
#                 {"label": column + "#" + value, "description": description,},
#             )
#
#     return (
#         pd.DataFrame.from_records(description_records)
#         .drop_duplicates()
#         .reset_index(drop=True)
#     )

# Build the label/description dataframe for the preprocessor's relevant columns.
description_df = description_preprocessor._load_column_descriptions(huawei_df, sequence_preprocessor.relevant_columns)

Loading descriptions for column python_module: 100%|█████████████| 95/95 [00:00<00:00, 212511.40it/s]
Loading descriptions for column log_level: 100%|████████████████████| 3/3 [00:00<00:00, 64860.37it/s]
Loading descriptions for column coarse_log_cluster_template: 100%|█| 482/482 [00:00<00:00, 71192.54it
Loading descriptions for column fine_log_cluster_template: 100%|█| 1446/1446 [00:00<00:00, 156753.86i
Loading descriptions for column http_status: 100%|███████████████| 11/11 [00:00<00:00, 167772.16it/s]
Loading descriptions for column programname: 100%|███████████████| 24/24 [00:00<00:00, 195842.99it/s]
Loading descriptions for column http_method: 100%|██████████████████| 4/4 [00:00<00:00, 32078.81it/s]
Loading descriptions for column url_cluster_template: 100%|██████| 34/34 [00:00<00:00, 265067.54it/s]
Loading descriptions for column Hostname: 100%|█████████████████████| 5/5 [00:00<00:00, 95760.37it/s]


In [12]:
# Inspect the generated label/description pairs (a bare last expression gets rich display).
# NOTE(review): display.max_rows was set to None earlier, so this renders every row —
# consider description_df.head() if the output gets too large.
description_df

Unnamed: 0,label,description
0,python_module#heat.engine.environment,Python module heat engine environment
1,python_module#neutron.common.config,Python module neutron common config
2,python_module#os_brick.initiator.connectors.remotefs,Python module os brick initiator connectors remotefs
3,python_module#nova.compute.api,Python module nova compute api
4,python_module#glance.api.v2.images,Python module glance api v2 images
5,python_module#neutron.plugins.ml2.drivers.type_flat,Python module neutron plugins ml2 drivers type flat
6,python_module#neutron.plugins.ml2.drivers.l2pop.mech_driver,Python module neutron plugins ml2 drivers l2pop mech driver
7,python_module#cinder.rpc,Python module cinder rpc
8,python_module#glance.common.wsgi,Python module glance common wsgi
9,python_module#neutron.db.l3_agentschedulers_db,Python module neutron db l3 agentschedulers db


### Step 2 Build knowledge from dataframe

In [13]:
# Create the description-knowledge object with the default knowledge configuration.
description_knowledge = knowledge.DescriptionKnowledge(
    config=knowledge_config,
)

# Build the knowledge from the label/description dataframe and the input vocabulary.
# The recorded output shows a "Preprocessing description words" pass over 2104 items.
description_knowledge.build_knowledge_from_df(
    description_df, metadata.x_vocab
)

Preprocessing description words: 100%|████████████████████████| 2104/2104 [00:00<00:00, 10286.20it/s]


## Generate model

In [None]:
# instantiate the description-based model (nothing is loaded from disk here)
model = models.DescriptionModel()

# build the model from the sequence metadata and the description knowledge
model.build(metadata, description_knowledge)

## Run Experiment

In [None]:
# train the model on the batched training dataset; the test dataset, the
# multilabel flag and the number of epochs come from the experiment configuration
# NOTE(review): presumably the test dataset is used for evaluation during training — confirm.
model.train_dataset(
    train_dataset,
    test_dataset,
    experiment_config.multilabel_classification,
    experiment_config.n_epochs,
)

In [None]:
# logging dataset info
# Both datasets are batched, so these sizes count batches, not single samples.
# Counting lazily avoids holding every batch in memory just to take a length.
mlflow.log_metric("train_size", sum(1 for _batch in train_dataset))
mlflow.log_metric("test_size", sum(1 for _batch in test_dataset))
mlflow.log_metric("x_vocab_size", len(metadata.x_vocab))
mlflow.log_metric("y_vocab_size", len(metadata.y_vocab))

# generate artifacts
# skip

# set mlflow tags
mlflow.set_tag("sequence_type", experiment_config.sequence_type)
mlflow.set_tag("model_type", experiment_config.model_type)
# a single-entry y vocabulary is tagged as risk prediction, anything else as sequence prediction
if len(metadata.y_vocab) == 1:
    mlflow.set_tag("task_type", "risk_prediction")
else:
    mlflow.set_tag("task_type", "sequence_prediction")

# The logging calls above guarantee an active run (MLflow creates one when
# none is active). Report the id of the run that actually received the data,
# then close it so it is not left in the RUNNING state.
finished_run_id = mlflow.active_run().info.run_id
mlflow.end_run()
logging.info("Finished run %s", finished_run_id)