In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm
import numpy as np
import pickle
from torch_geometric.data import Data, DataLoader
import torch
from joblib import Parallel, delayed
import networkx as nx
import math
import optuna

import sys
sys.path.append('../src')
from incident_diagnosis.incident_diagnosis import root_cause_localization, explain, optimize, get_weight_from_edge_info

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled collection of incident cases used by every later cell.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a data file from a trusted source.
with open('../data/OB.pkl', 'rb') as f:
    incident_topologies = pickle.load(f)

In [3]:
# Number of incident cases loaded from the pickle file.
len(incident_topologies)

210

In [4]:
import logging

# Keep Optuna at WARNING level so that only warnings and errors are printed,
# not the detailed per-trial optimization log.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Starting configuration: a single node clue and no edge clues.
init_clue_tag = 'AlertCount'
node_clue_tags = ['AlertCount']
edge_clue_tags = []

# Every clue tag starts with weight 1 (edge tags are inserted before node tags).
a = {}
for clue_tag in [*edge_clue_tags, *node_clue_tags]:
    a[clue_tag] = 1

get_edge_weight = None
edge_backward_factor = 0.3

# Walk through the cases in order: predict with the current clue tags and
# weights, and call optimize() whenever the labelled root cause was missed.
for i, case in enumerate(incident_topologies):
    predicted = root_cause_localization(
        case, node_clue_tags, edge_clue_tags, a, get_edge_weight, edge_backward_factor
    )
    case['pred_incremental'] = predicted

    missed = case['root_cause'] not in predicted
    # NOTE(review): the string 'None' is presumably a "no prediction" sentinel
    # returned by root_cause_localization -- confirm against its implementation.
    if missed and predicted != 'None':
        # optimize() receives every case up to and including the current one
        # and hands back the clue tags and weights used for the next cases.
        seen_cases = incident_topologies[:i + 1]
        node_clue_tags, a = optimize(
            case, node_clue_tags, edge_clue_tags, a, get_edge_weight,
            edge_backward_factor, seen_cases, init_clue_tag, num_trials=10,
        )

100%|██████████| 10/10 [00:00<00:00, 62.39it/s]


best trial: FrozenTrial(number=4, state=TrialState.COMPLETE, values=[0.75, 5.347744562027804], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 14, 190861), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 14, 204848), params={'a:AlertCount': 2.1182739966945237, 'a:kpi_container_network_transmit_packets': 3.2294705653332807}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:kpi_container_network_transmit_packets': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=4, value=None)
A better solution found
a: {'AlertCount': 2.1182739966945237, 'kpi_container_network_transmit_packets': 3.2294705653332807}


100%|██████████| 10/10 [00:00<00:00, 38.10it/s]


A better solution found


100%|██████████| 10/10 [00:00<00:00, 11.23it/s]




100%|██████████| 10/10 [00:01<00:00,  5.35it/s]




100%|██████████| 10/10 [00:02<00:00,  3.87it/s]




100%|██████████| 10/10 [00:02<00:00,  3.34it/s]




100%|██████████| 10/10 [00:04<00:00,  2.36it/s]




100%|██████████| 10/10 [00:05<00:00,  1.89it/s]




100%|██████████| 10/10 [00:06<00:00,  1.63it/s]




100%|██████████| 10/10 [00:06<00:00,  1.55it/s]




100%|██████████| 10/10 [00:06<00:00,  1.48it/s]




100%|██████████| 10/10 [00:08<00:00,  1.16it/s]




100%|██████████| 10/10 [00:09<00:00,  1.10it/s]




100%|██████████| 10/10 [00:09<00:00,  1.08it/s]




100%|██████████| 10/10 [00:12<00:00,  1.21s/it]




100%|██████████| 10/10 [00:12<00:00,  1.28s/it]




100%|██████████| 10/10 [00:15<00:00,  1.54s/it]




100%|██████████| 10/10 [00:15<00:00,  1.57s/it]




100%|██████████| 10/10 [00:18<00:00,  1.82s/it]




100%|██████████| 10/10 [00:20<00:00,  2.05s/it]




100%|██████████| 10/10 [00:21<00:00,  2.16s/it]




100%|██████████| 10/10 [00:25<00:00,  2.54s/it]




100%|██████████| 10/10 [00:28<00:00,  2.88s/it]




100%|██████████| 10/10 [00:29<00:00,  2.92s/it]




100%|██████████| 10/10 [00:31<00:00,  3.15s/it]




100%|██████████| 10/10 [00:35<00:00,  3.54s/it]




100%|██████████| 10/10 [00:36<00:00,  3.63s/it]




100%|██████████| 10/10 [00:39<00:00,  3.96s/it]




100%|██████████| 10/10 [00:40<00:00,  4.03s/it]




100%|██████████| 10/10 [00:42<00:00,  4.23s/it]




100%|██████████| 10/10 [00:45<00:00,  4.58s/it]




100%|██████████| 10/10 [00:49<00:00,  4.92s/it]






In [5]:
# Flag each case as a hit when its labelled root cause is contained in the
# prediction stored under 'pred_incremental'.
for case in incident_topologies:
    labelled_root_cause = case['root_cause']
    case['right_incremental'] = labelled_root_cause in case['pred_incremental']


In [6]:
# A@1 of the 'incremental' predictions over all cases.
test_target = 'incremental'
begin_to_test_ratio = 0  # fraction of leading cases excluded from the evaluation
first_test_index = int(begin_to_test_ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     174.000000
False     36.000000
A@1        0.828571
Name: count, dtype: float64


In [7]:
# A@1 of the 'incremental' predictions, skipping the first 30% of the cases.
test_target = 'incremental'
begin_to_test_ratio = 0.3  # fraction of leading cases excluded from the evaluation
first_test_index = int(begin_to_test_ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     126.000000
False     21.000000
A@1        0.857143
Name: count, dtype: float64


In [8]:
# A@1 of the 'incremental' predictions, skipping the first 60% of the cases.
test_target = 'incremental'
begin_to_test_ratio = 0.6  # fraction of leading cases excluded from the evaluation
first_test_index = int(begin_to_test_ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     72.000000
False    12.000000
A@1       0.857143
Name: count, dtype: float64


In [9]:
# Baseline run: localize every case with 'AlertCount' as the only node clue,
# no edge clues, and None in the argument position where the incremental run
# passes its weight dict; the remaining arguments are left at whatever
# defaults root_cause_localization declares.
for case in incident_topologies:
    case['pred_alertcount'] = root_cause_localization(case, ['AlertCount'], [], None)

In [10]:
# Flag each case as a hit when its labelled root cause is contained in the
# prediction stored under 'pred_alertcount'.
for case in incident_topologies:
    labelled_root_cause = case['root_cause']
    case['right_alertcount'] = labelled_root_cause in case['pred_alertcount']


In [11]:
# A@1 of the 'alertcount' baseline predictions over all cases.
test_target = 'alertcount'
ratio = 0  # fraction of leading cases excluded from the evaluation
first_test_index = int(ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     160.000000
False     50.000000
A@1        0.761905
Name: count, dtype: float64


In [12]:
# A@1 of the 'alertcount' baseline predictions, skipping the first 30% of the cases.
test_target = 'alertcount'
ratio = 0.3  # fraction of leading cases excluded from the evaluation
first_test_index = int(ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     117.000000
False     30.000000
A@1        0.795918
Name: count, dtype: float64


In [13]:
# A@1 of the 'alertcount' baseline predictions, skipping the first 60% of the cases.
test_target = 'alertcount'
ratio = 0.6  # fraction of leading cases excluded from the evaluation
first_test_index = int(ratio * len(incident_topologies))
hits = pd.Series(
    [case['right_' + test_target] for case in incident_topologies[first_test_index:]]
)
# value_counts() omits labels that never occur, so an all-hit or all-miss slice
# would make summary[False] / summary[True] raise KeyError. reindex() pins both
# labels (True first) and fills a missing one with a count of 0.
summary = hits.value_counts().reindex([True, False], fill_value=0)
summary['A@1'] = summary[True] / (summary[True] + summary[False])
print(summary)

True     68.000000
False    16.000000
A@1       0.809524
Name: count, dtype: float64


In [14]:
# Re-run localization for every case with the current node_clue_tags and
# weights `a`, then print the result of explain() followed by the case's
# failure type.
# NOTE(review): this overwrites case['pred_incremental'] in place, while the
# 'right_incremental' flags are not recomputed in this cell -- confirm that
# the overwrite is intended before re-running the evaluation cells.
for i, case in enumerate(incident_topologies):  # `i` is not used in the loop body
    
    case['pred_incremental'] = root_cause_localization(case, node_clue_tags, edge_clue_tags, a, get_edge_weight, edge_backward_factor)

    # presumably explain() relies on state produced by the localization call
    # above -- TODO confirm against its implementation
    sorted_refined_explanation_power = explain(case, 'root_cause')
    print(sorted_refined_explanation_power)
    print(case['failure_type'])

Node CPU Failure
[]
Node CPU Failure
Kubernetes Container Read I/O Load
Node Disk Read I/O Consumption
Kubernetes Container Memory Load
Node Disk Read I/O Consumption
[]
Kubernetes Container Network Packet Loss
Kubernetes Container Network Resource Packet Duplication
Node Disk Read I/O Consumption
Kubernetes Container Network Packet Loss
Kubernetes Container Network Packet Loss
Kubernetes Container CPU Load
[]
Kubernetes Container Network Resource Packet Duplication
Kubernetes Container CPU Load
Node Memory Consumption
Node Disk Space Consumption
Node Disk Space Consumption
Kubernetes Container Network Latency
Kubernetes Container Process Termination
Kubernetes Container CPU Load
Kubernetes Container Write I/O Load
Kubernetes Container Network Resource Packet Corruption
Kubernetes Container Process Termination
Kubernetes Container CPU Load
Node Disk Write I/O Consumption
Kubernetes Container Network Latency
Kubernetes Container Write I/O Load
Kubernetes Container Read I/O Load
Node Dis