In [23]:
import logging
import tensorflow as tf
import mlflow
import random
import pandas as pd
import numpy as np

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

sys.argv.clear()
sys.argv.append("")

In [2]:
from src import config
from src.features import preprocessing,sequences,knowledge
from src.training import models
from src import refinement

# four types of knowledge for Huawei:
# gram, text, causal, log template(gram_logs)
experiment_config = config.ExperimentConfig()
experiment_config.model_type = "gram"
experiment_config.sequence_type = "huawei_logs"

model_config = models.config.ModelConfig()
model_config.rnn_type: str = "gru"

# keep all other default configurations 
huawei_preprocessor_config = preprocessing.huawei.HuaweiPreprocessorConfig()
sequence_config = sequences.config.SequenceConfig()
knowledge_config = knowledge.config.KnowledgeConfig()
refinement_config = refinement.config.RefinementConfig()

In [3]:
def log_all_configs_to_mlflow():
    for config in [
        experiment_config,
        huawei_preprocessor_config,
        sequence_config,
        model_config,
        knowledge_config,
        refinement_config,
    ]:
        for config_name, config_value in vars(config).items():
            full_config_name = config.__class__.__name__ + config_name
            mlflow.log_param(full_config_name, str(config_value))

In [4]:
mlflow.set_experiment("Domain Guided Monitoring")
with mlflow.start_run() as run:
    run_id = run.info.run_id
logging.info("Starting run %s", run_id)
tf.random.set_seed(experiment_config.tensorflow_seed)
random.seed(experiment_config.random_seed)

## Load Huawei sequences

In [5]:
sequence_preprocessor = preprocessing.ConcurrentAggregatedLogsPreprocessor(
    huawei_preprocessor_config,
)
sequence_column_name = sequence_preprocessor.sequence_column_name
sequence_df = sequence_preprocessor.load_data()

transformer = sequences.transformer.NextPartialSequenceTransformerFromDataframe(sequence_config)
metadata = transformer.collect_metadata(sequence_df, sequence_column_name)

  sequence_df = sequence_preprocessor.load_data()
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:20<00:00, 7261.11it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:15<00:00, 9772.51it/s]
Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 14919.18it/s]


In [9]:
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None) 
pd.set_option('max_colwidth',100)

sequence_df

Unnamed: 0,num_logs,num_events,all_events,attributes,coarse_log_cluster_template,fine_log_cluster_template
0,169230,169230,"[[Hostname#wally113, url_cluster_template#resource providers * allocations, , coarse_log_cluster...","[[Hostname#wally113, url_cluster_template#resource providers * allocations, , , http_status#200....",[[coarse_log_cluster_template#25 nov 2019 * * * 0100 * * * * * * * * * * * keystoneauth1 3131 py...,[[fine_log_cluster_template#25 nov 2019 15 * * 0100 get resource providers * * http 11 200 * * n...


## Generate dataset

In [10]:
sequence_df_pkl_file: str = "data/sequences_df.pkl"
sequence_df.to_pickle(sequence_df_pkl_file)

train_sequences, test_sequences = transformer._split_train_test(sequence_df, sequence_column_name)

def generate(for_train):
    relevant_sequences = train_sequences if for_train else test_sequences
    for sequence in relevant_sequences:
        split_sequences = transformer._split_sequence(sequence)
        for split_sequence in split_sequences:
            transformer._translate_and_pad(split_sequence, metadata)
            yield split_sequence.x_vecs_stacked, split_sequence.y_vec

def generate_train():
    return generate(for_train=True)

def generate_test():
    return generate(for_train=False)

In [11]:
train_dataset = (
    tf.data.Dataset.from_generator(
        generate_train,
        output_types=(tf.float32, tf.float32),
    )
    .shuffle(
        experiment_config.dataset_shuffle_buffer,
        seed=experiment_config.dataset_shuffle_seed,
        reshuffle_each_iteration=True,
    )
    .batch(experiment_config.batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

test_dataset = (
    tf.data.Dataset.from_generator(
        generate_test,
        output_types=(tf.float32, tf.float32),
    )
    .batch(experiment_config.batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

2022-05-30 07:15:17.088286: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-30 07:15:17.088320: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-30 07:15:17.088348: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hj-ubuntu): /proc/driver/nvidia/version does not exist
2022-05-30 07:15:17.088750: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-05-30 07:15:17.120116: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2599990000 Hz
2022-05-30 07:15:17.121816: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f2168000b60 initialized for platform Host (this does not guarantee that 

## Generate hierarchy 

In [12]:
hierarchy_preprocessor = preprocessing.ConcurrentAggregatedLogsHierarchyPreprocessor(
    config = huawei_preprocessor_config
)

### Step 1 Load log-only data 

### 1.1 Read log to dataframe

In [13]:
'''
def _read_log_df(self) -> pd.DataFrame:
    df = (
        pd.read_csv(self.config.aggregated_log_file)
        .fillna("")
        .astype(str)
        .replace(np.nan, "", regex=True)
    )
    rel_df = df[
        self.config.relevant_aggregated_log_columns
        + [self.config.log_datetime_column_name]
        + [self.config.log_payload_column_name]
        + [self.config.url_column_name]
    ]
    rel_df = self._add_log_drain_clusters(rel_df)
    if self.config.log_template_file.exists():
        rel_df = self._add_precalculated_log_templates(rel_df)
    rel_df["timestamp"] = pd.to_datetime(
        rel_df[self.config.log_datetime_column_name]
    )
    return rel_df

def _add_log_drain_clusters(self, log_df: pd.DataFrame) -> pd.DataFrame:
    log_result_df = self._add_log_drain_clusters_prefix(
        log_df=log_df,
        depth=self.config.fine_drain_log_depth,
        st=self.config.fine_drain_log_st,
        prefix="fine_",
    )
    log_result_df = self._add_log_drain_clusters_prefix(
        log_df=log_result_df,
        depth=self.config.coarse_drain_log_depth,
        st=self.config.coarse_drain_log_st,
        prefix="coarse_",
    )
    for i in range(len(self.config.drain_log_depths)):
        log_result_df = self._add_log_drain_clusters_prefix(
            log_df=log_result_df,
            depth=self.config.drain_log_depths[i],
            st=self.config.drain_log_sts[i],
            prefix=str(i) + "_",
        )
    return log_result_df


def _add_log_drain_clusters_prefix(
    self, log_df: pd.DataFrame, depth: int, st: float, prefix: str
) -> pd.DataFrame:
    all_logs_df = pd.DataFrame(
        log_df[self.config.log_payload_column_name].dropna().drop_duplicates()
    )
    drain = Drain(
        DrainParameters(
            depth=depth,
            st=st,
            rex=[
                ("(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)", ""),
                (self.request_drain_regex, " "),
                ("[^a-zA-Z\d\s:]", ""),
            ],
        ),
        data_df=all_logs_df,
        data_df_column_name=self.config.log_payload_column_name,
    )
    drain_result_df = drain.load_data().drop_duplicates().set_index("log_idx")
    log_result_df = (
        pd.merge(
            log_df,
            pd.merge(
                all_logs_df,
                drain_result_df,
                left_index=True,
                right_index=True,
                how="left",
            )
            .drop_duplicates()
            .reset_index(drop=True),
            on=self.config.log_payload_column_name,
            how="left",
        )
        .rename(
            columns={
                "cluster_template": prefix + "log_cluster_template",
                "cluster_path": prefix + "log_cluster_path",
            }
        )
        .drop(columns=["cluster_id"])
    )
    log_result_df[prefix + "log_cluster_template"] = (
        log_result_df[prefix + "log_cluster_template"]
        .fillna("")
        .astype(str)
        .replace(np.nan, "", regex=True)
    )
    return log_result_df

'''

huawei_df = sequence_preprocessor._read_log_df()

  if (await self.run_code(code, result,  async_=asy)):
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:20<00:00, 7441.39it/s]
Generating DRAIN clusters from log_df: 100%|███████████████| 152373/152373 [00:15<00:00, 9729.12it/s]


In [15]:
huawei_df.head()

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,Payload,http_url,fine_log_cluster_template,fine_log_cluster_path,coarse_log_cluster_template,coarse_log_cluster_path,timestamp
0,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.683000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.683000+01:00
1,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.877000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.877000+01:00
2,wally113,WARNING,keystone,keystone.server.flask.application,,,2019-11-25T15:01:52.744000000+01:00,Could not find domain: Default.: DomainNotFound: Could not find domain: Default.,,could not find domain default domainnotfound could not find domain default,11 could not find domain default domainnotfound could not find,could not find domain default domainnotfound could not find domain default,11 could not find,2019-11-25 15:01:52.744000+01:00
3,wally113,INFO,glance-api,eventlet.wsgi.server,,,2019-11-25T15:01:48.510000000+01:00,"172.17.0.2 - - [25/Nov/2019 15:01:48] ""GET /v2/schemas/image HTTP/1.1"" 200 5562 0.002795",,25 nov 2019 * 01 * get v2 schemas image http 11 200 5562 *,15 * nov * * * * get v2 schemas,25 nov 2019 * * * * * * * * * * * *,15 * nov *,2019-11-25 15:01:48.510000+01:00
4,wally113,INFO,nova-api,nova.api.openstack,,,2019-11-25T15:01:56.520000000+01:00,http://130.149.249.123:8774/v2.1/dbf4ab7d6e84449e93e02f305534200b/servers returned with HTTP 500,,http v21 * servers returned with http 500,8 http v21 * servers returned with http *,http v21 * servers returned with http 500,8 http v21 *,2019-11-25 15:01:56.520000+01:00


### 1.2 Add url drain clusters

In [16]:
'''
def _add_url_drain_clusters(self, df: pd.DataFrame) -> pd.DataFrame:
    url_df = pd.DataFrame(
        df[self.config.url_column_name].dropna().drop_duplicates()
    )
    drain = Drain(
        DrainParameters(
            depth=self.config.drain_url_depth,
            st=self.config.drain_url_st,
            rex=[(self.request_drain_regex, " "),],
        ),
        data_df=url_df,
        data_df_column_name=self.config.url_column_name,
    )
    drain_result_df = (
        drain.load_data().drop_duplicates(ignore_index=False).set_index("log_idx")
    )
    url_result_df = (
        pd.merge(
            df,
            pd.merge(
                url_df,
                drain_result_df,
                left_index=True,
                right_index=True,
                how="left",
            )
            .drop_duplicates()
            .reset_index(drop=True),
            on=self.config.url_column_name,
            how="left",
        )
        .rename(
            columns={
                "cluster_template": "url_cluster_template",
                "cluster_path": "url_cluster_path",
            }
        )
        .drop(columns=["cluster_id"])
    )
    url_result_df["url_cluster_template"] = (
        url_result_df["url_cluster_template"]
        .fillna("")
        .astype(str)
        .replace(np.nan, "", regex=True)
    )
    return url_result_df
'''

huawei_df = sequence_preprocessor._add_url_drain_clusters(huawei_df)

Generating DRAIN clusters from log_df: 100%|████████████████████| 970/970 [00:00<00:00, 11959.63it/s]


In [19]:
huawei_df.head(100)

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,Payload,http_url,fine_log_cluster_template,fine_log_cluster_path,coarse_log_cluster_template,coarse_log_cluster_path,timestamp,url_cluster_template,url_cluster_path
0,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.683000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.683000+01:00,,
1,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.877000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.877000+01:00,,
2,wally113,WARNING,keystone,keystone.server.flask.application,,,2019-11-25T15:01:52.744000000+01:00,Could not find domain: Default.: DomainNotFound: Could not find domain: Default.,,could not find domain default domainnotfound could not find domain default,11 could not find domain default domainnotfound could not find,could not find domain default domainnotfound could not find domain default,11 could not find,2019-11-25 15:01:52.744000+01:00,,
3,wally113,INFO,glance-api,eventlet.wsgi.server,,,2019-11-25T15:01:48.510000000+01:00,"172.17.0.2 - - [25/Nov/2019 15:01:48] ""GET /v2/schemas/image HTTP/1.1"" 200 5562 0.002795",,25 nov 2019 * 01 * get v2 schemas image http 11 200 5562 *,15 * nov * * * * get v2 schemas,25 nov 2019 * * * * * * * * * * * *,15 * nov *,2019-11-25 15:01:48.510000+01:00,,
4,wally113,INFO,nova-api,nova.api.openstack,,,2019-11-25T15:01:56.520000000+01:00,http://130.149.249.123:8774/v2.1/dbf4ab7d6e84449e93e02f305534200b/servers returned with HTTP 500,,http v21 * servers returned with http 500,8 http v21 * servers returned with http *,http v21 * servers returned with http 500,8 http v21 *,2019-11-25 15:01:56.520000+01:00,,
5,wally113,ERROR,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:56.964000000+01:00,Unexpected exception in API method: HTTPInternalServerError: HTTP HTTPInternalServerError\n2019-...,,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in api method httpinternalservererror http httpinternalservererror *,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in,2019-11-25 15:01:56.964000+01:00,,
6,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.227000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.227000+01:00,,
7,wally113,INFO,nova-api,nova.osapi_compute.wsgi.server,,,2019-11-25T15:01:57.228000000+01:00,"172.17.0.2 ""POST /v2.1/dbf4ab7d6e84449e93e02f305534200b/servers HTTP/1.1"" status: 500 len: 647 t...",,post v21 * servers http 11 status 500 len * time *,12 post v21 * servers http * status * len,post v21 * servers http 11 status * len * time *,12 post v21 *,2019-11-25 15:01:57.228000+01:00,,
8,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-25T15:01:49.842000000+01:00,"172.17.0.2 ""DELETE /v2.0/security-groups/78c81907-a991-4109-91df-57258ed05d42 HTTP/1.1"" status: ...",,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups * http * status * len,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups,2019-11-25 15:01:49.842000+01:00,,
9,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.882000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.882000+01:00,,


### 1.3 Finish loading log-only data 

In [24]:
'''
def _load_log_only_data(self) -> pd.DataFrame:
    log_df = self._read_log_df()
    log_df = self._add_url_drain_clusters(log_df)
    for column in [x for x in log_df.columns if "log_cluster_template" in x]:
        log_df[column] = (
            log_df[column]
            .fillna("")
            .astype(str)
            .replace(np.nan, "", regex=True)
            .apply(lambda x: x if len(x) > 0 else "___empty___")
        )
    return log_df
'''
#huawei_df = sequence_preprocessor._load_log_only_data()  

for column in [x for x in huawei_df.columns if "log_cluster_template" in x]:
    huawei_df[column] = (
        huawei_df[column]
        .fillna("")
        .astype(str)
        .replace(np.nan, "", regex=True)
        .apply(lambda x: x if len(x) > 0 else "___empty___")
    )


In [25]:
huawei_df.head(100)

Unnamed: 0,Hostname,log_level,programname,python_module,http_status,http_method,@timestamp,Payload,http_url,fine_log_cluster_template,fine_log_cluster_path,coarse_log_cluster_template,coarse_log_cluster_path,timestamp,url_cluster_template,url_cluster_path
0,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.683000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.683000+01:00,,
1,wally113,INFO,neutron-openvswitch-agent,neutron.agent.securitygroups_rpc,,,2019-11-25T15:01:49.877000000+01:00,Security group rule updated ['78c81907-a991-4109-91df-57258ed05d42'],,security group rule updated *,5 security group rule updated *,security group rule updated *,5 security group rule,2019-11-25 15:01:49.877000+01:00,,
2,wally113,WARNING,keystone,keystone.server.flask.application,,,2019-11-25T15:01:52.744000000+01:00,Could not find domain: Default.: DomainNotFound: Could not find domain: Default.,,could not find domain default domainnotfound could not find domain default,11 could not find domain default domainnotfound could not find,could not find domain default domainnotfound could not find domain default,11 could not find,2019-11-25 15:01:52.744000+01:00,,
3,wally113,INFO,glance-api,eventlet.wsgi.server,,,2019-11-25T15:01:48.510000000+01:00,"172.17.0.2 - - [25/Nov/2019 15:01:48] ""GET /v2/schemas/image HTTP/1.1"" 200 5562 0.002795",,25 nov 2019 * 01 * get v2 schemas image http 11 200 5562 *,15 * nov * * * * get v2 schemas,25 nov 2019 * * * * * * * * * * * *,15 * nov *,2019-11-25 15:01:48.510000+01:00,,
4,wally113,INFO,nova-api,nova.api.openstack,,,2019-11-25T15:01:56.520000000+01:00,http://130.149.249.123:8774/v2.1/dbf4ab7d6e84449e93e02f305534200b/servers returned with HTTP 500,,http v21 * servers returned with http 500,8 http v21 * servers returned with http *,http v21 * servers returned with http 500,8 http v21 *,2019-11-25 15:01:56.520000+01:00,,
5,wally113,ERROR,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:56.964000000+01:00,Unexpected exception in API method: HTTPInternalServerError: HTTP HTTPInternalServerError\n2019-...,,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in api method httpinternalservererror http httpinternalservererror *,unexpected exception in api method httpinternalservererror http httpinternalservererror 20191125...,1115 unexpected exception in,2019-11-25 15:01:56.964000+01:00,,
6,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.227000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.227000+01:00,,
7,wally113,INFO,nova-api,nova.osapi_compute.wsgi.server,,,2019-11-25T15:01:57.228000000+01:00,"172.17.0.2 ""POST /v2.1/dbf4ab7d6e84449e93e02f305534200b/servers HTTP/1.1"" status: 500 len: 647 t...",,post v21 * servers http 11 status 500 len * time *,12 post v21 * servers http * status * len,post v21 * servers http 11 status * len * time *,12 post v21 *,2019-11-25 15:01:57.228000+01:00,,
8,wally113,INFO,neutron-server,neutron.wsgi,,,2019-11-25T15:01:49.842000000+01:00,"172.17.0.2 ""DELETE /v2.0/security-groups/78c81907-a991-4109-91df-57258ed05d42 HTTP/1.1"" status: ...",,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups * http * status * len,delete v20 securitygroups * http 11 status 204 len 173 time *,12 delete v20 securitygroups,2019-11-25 15:01:49.842000+01:00,,
9,wally113,INFO,nova-api,nova.api.openstack.wsgi,,,2019-11-25T15:01:57.882000000+01:00,HTTP exception thrown: Unexpected API Error. Please report this at http://bugs.launchpad.net/nov...,,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown unexpected api error please report this,http exception thrown unexpected api error please report this at http bugslaunchpadnet nova and ...,23 http exception thrown,2019-11-25 15:01:57.882000+01:00,,


### Sep 2 Load attribute-only hierarchy

In [27]:
relevant_columns = set(
    [
        x
        for x in sequence_preprocessor.relevant_columns
        if "log_cluster_template" not in x
    ]
)

In [28]:
'''
def _load_attribute_hierarchy(
    self, huawei_df: pd.DataFrame, relevant_columns: Set[str]
) -> pd.DataFrame:
    hierarchy_df = pd.DataFrame(
        columns=["parent_id", "child_id", "parent_name", "child_name"]
    )
    for column in relevant_columns:
        hierarchy_df = hierarchy_df.append(
            {
                "parent_id": "root",
                "parent_name": "root",
                "child_id": column,
                "child_name": column,
            },
            ignore_index=True,
        )
        values = set(
            [
                str(x).lower()
                for x in huawei_df[column]
                .dropna()
                .astype(str)
                .replace(np.nan, "", regex=True)
                if len(str(x)) > 0 and str(x).lower() != "nan"
            ]
        )
        for value in tqdm(values, desc="Loading hierarchy for column " + column):
            hierarchy_elements = [column]
            if column == "Hostname":
                hierarchy_elements.append(value.rstrip("0123456789"))
            elif column == "http_status":
                hierarchy_elements.append(value[0] + "00")
            elif "cluster" in column:
                hierarchy_elements = hierarchy_elements + value.split()
            else:
                hierarchy_elements = hierarchy_elements + re.split(
                    "[,._\-\*]+", value
                )
                hierarchy_elements = [
                    x.strip() for x in hierarchy_elements if len(x.strip()) > 0
                ]
            if hierarchy_elements[len(hierarchy_elements) - 1] == value:
                hierarchy_elements = hierarchy_elements[
                    : len(hierarchy_elements) - 1
                ]

            hierarchy = []
            for i in range(1, len(hierarchy_elements) + 1):
                hierarchy.append("->".join(hierarchy_elements[0:i]))
            hierarchy.append(column + "#" + value)

            parent_id = column
            parent_name = column
            for i in range(len(hierarchy)):
                child_id = hierarchy[i]
                child_name = child_id.split("->")[-1]
                if not parent_id == child_id:
                    hierarchy_df = hierarchy_df.append(
                        {
                            "parent_id": parent_id,
                            "parent_name": parent_name,
                            "child_id": child_id,
                            "child_name": child_name,
                        },
                        ignore_index=True,
                    )
                parent_id = child_id
                parent_name = child_name

    return hierarchy_df[["parent_id", "child_id", "parent_name", "child_name"]]
'''
    
attribute_hierarchy = hierarchy_preprocessor._load_attribute_hierarchy(
    huawei_df, relevant_columns
)

Loading hierarchy for column Hostname: 100%|██████████████████████████| 5/5 [00:00<00:00, 293.72it/s]
Loading hierarchy for column url_cluster_template: 100%|████████████| 34/34 [00:00<00:00, 125.47it/s]
Loading hierarchy for column log_level: 100%|█████████████████████████| 3/3 [00:00<00:00, 381.27it/s]
Loading hierarchy for column python_module: 100%|███████████████████| 95/95 [00:00<00:00, 114.21it/s]
Loading hierarchy for column http_status: 100%|█████████████████████| 11/11 [00:00<00:00, 281.76it/s]
Loading hierarchy for column http_method: 100%|███████████████████████| 4/4 [00:00<00:00, 542.92it/s]
Loading hierarchy for column programname: 100%|█████████████████████| 24/24 [00:00<00:00, 176.79it/s]


In [30]:
attribute_hierarchy

Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,Hostname,root,Hostname
1,Hostname,Hostname->wally,Hostname,wally
2,Hostname->wally,Hostname#wally113,wally,Hostname#wally113
3,Hostname,Hostname->wally,Hostname,wally
4,Hostname->wally,Hostname#wally122,wally,Hostname#wally122
5,Hostname,Hostname->wally,Hostname,wally
6,Hostname->wally,Hostname#wally124,wally,Hostname#wally124
7,Hostname,Hostname->wally,Hostname,wally
8,Hostname->wally,Hostname#wally117,wally,Hostname#wally117
9,Hostname,Hostname->wally,Hostname,wally


In [35]:
'''
def _load_log_hierarchy(
    self, huawei_df: pd.DataFrame, relevant_columns: Set[str]
) -> pd.DataFrame:
    hierarchy_records = []
    for _, row in tqdm(
        huawei_df.iterrows(),
        desc="Adding huawei log hierarchy",
        total=len(huawei_df),
    ):
        log_template = str(row[self.config.relevant_log_column]).lower()
        for column in relevant_columns:
            row_value = (
                column + "#" + str(row[column]).lower()
                if len(str(row[column])) > 0
                else ""
            )
            if len(row_value) == 0:
                continue

            hierarchy_records.append(
                {
                    "parent_id": row_value,
                    "parent_name": row_value.split("#")[1],
                    "child_id": self.config.relevant_log_column + "#" + log_template,
                    "child_name": log_template,
                },
            )
    return (
        pd.DataFrame.from_records(hierarchy_records)
        .drop_duplicates()
        .reset_index(drop=True)
    )
'''

attribute_hierarchy.append(
    hierarchy_preprocessor._load_log_hierarchy(huawei_df, relevant_columns),
    ignore_index=True,
).drop_duplicates().reset_index(drop=True)

Adding huawei log hierarchy: 100%|████████████████████████| 169230/169230 [00:16<00:00, 10561.93it/s]


Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,Hostname,root,Hostname
1,Hostname,Hostname->wally,Hostname,wally
2,Hostname->wally,Hostname#wally113,wally,Hostname#wally113
3,Hostname->wally,Hostname#wally122,wally,Hostname#wally122
4,Hostname->wally,Hostname#wally124,wally,Hostname#wally124
5,Hostname->wally,Hostname#wally117,wally,Hostname#wally117
6,Hostname->wally,Hostname#wally123,wally,Hostname#wally123
7,root,url_cluster_template,root,url_cluster_template
8,url_cluster_template,url_cluster_template->v3,url_cluster_template,v3
9,url_cluster_template->v3,url_cluster_template->v3->fb6b46e184a14d519f0a3e6013967188,v3,fb6b46e184a14d519f0a3e6013967188


In [36]:
attribute_hierarchy

Unnamed: 0,parent_id,child_id,parent_name,child_name
0,root,Hostname,root,Hostname
1,Hostname,Hostname->wally,Hostname,wally
2,Hostname->wally,Hostname#wally113,wally,Hostname#wally113
3,Hostname,Hostname->wally,Hostname,wally
4,Hostname->wally,Hostname#wally122,wally,Hostname#wally122
5,Hostname,Hostname->wally,Hostname,wally
6,Hostname->wally,Hostname#wally124,wally,Hostname#wally124
7,Hostname,Hostname->wally,Hostname,wally
8,Hostname->wally,Hostname#wally117,wally,Hostname#wally117
9,Hostname,Hostname->wally,Hostname,wally


### Step 3 Build hierarchy from daraframe

In [37]:
'''
def build_hierarchy_from_df(
    self, hierarchy_df: pd.DataFrame, vocab: Dict[str, int]
):
    self.vocab: Dict[str, int] = vocab
    self._build_extended_vocab(hierarchy_df, vocab)
    for _, row in tqdm(hierarchy_df.iterrows(), desc="Building Hierarchy from df"):
        child_id = row[self.child_id_col]
        if child_id not in self.extended_vocab:
            logging.debug("Ignoring node %s as not in dataset", child_id)
            continue

        child_node = self.nodes[self.extended_vocab[child_id]]
        parent_node = self.nodes[self.extended_vocab[row[self.parent_id_col]]]

        if child_node is not parent_node:
            child_node.in_nodes.add(parent_node)
            parent_node.out_nodes.add(child_node)

    logging.info("Built hierarchy with %d nodes", len(self.nodes))
'''

hierarchy = knowledge.HierarchyKnowledge(
    config=knowledge_config,
)
hierarchy.build_hierarchy_from_df(attribute_hierarchy, metadata.x_vocab)

Building Hierarchy from df: 743it [00:00, 15962.71it/s]


## Genarate model

In [None]:
# load model
model = models.GramModel()

# build model
model.build(metadata, hierarchy)

## Run Experiment

In [None]:
# train model
model.train_dataset(
    train_dataset,
    test_dataset,
    experiment_config.multilabel_classification,
    experiment_config.n_epochs,
)

In [None]:
# logging dataset info
mlflow.log_metric("train_size", len([x for x in train_dataset]))
mlflow.log_metric("test_size", len([x for x in test_dataset]))
mlflow.log_metric("x_vocab_size", len(metadata.x_vocab))
mlflow.log_metric("y_vocab_size", len(metadata.y_vocab))

# generate artifacts
# skip

# set mlflow tags
mlflow.set_tag("sequence_type", experiment_config.sequence_type)
mlflow.set_tag("model_type", experiment_config.model_type)
if len(metadata.y_vocab) == 1:
    mlflow.set_tag("task_type", "risk_prediction")
else:
    mlflow.set_tag("task_type", "sequence_prediction")

logging.info("Finished run %s", run_id)