In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm
import numpy as np
import pickle
from torch_geometric.data import Data, DataLoader
import torch
from joblib import Parallel, delayed
import networkx as nx
import math
import optuna

import sys
sys.path.append('../src')
from incident_diagnosis.incident_diagnosis import root_cause_localization, explain, optimize, get_weight_from_edge_info

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled incident cases used by every later cell.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only open this file if it comes from a trusted source.
with open('../data/AIOPS2021.pkl', 'rb') as f:
    incident_topologies = pickle.load(f)

In [3]:
# Number of incident cases loaded from the pickle.
len(incident_topologies)

133

In [4]:
import logging

# Keep Optuna at WARNING level: only warnings and errors are printed,
# not the detailed per-trial optimization log.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Starting clue configuration. node_clue_tags and a are rebound below whenever
# optimize() is invoked, so these are only the initial values.
init_clue_tag = 'AlertCount'
node_clue_tags = ['AlertCount']
edge_clue_tags = []
get_edge_weight = None
edge_backward_factor = 0.3

# Every configured clue starts with weight 1; edge clues are inserted before
# node clues, matching the insertion order of two separate passes.
a = {}
for clue_tag in edge_clue_tags + node_clue_tags:
    a[clue_tag] = 1

# Predict-then-tune: each case is predicted with the configuration available
# before it is seen, and the prediction is stored on the case.
for i, case in enumerate(incident_topologies):
    prediction = root_cause_localization(case, node_clue_tags, edge_clue_tags, a, get_edge_weight, edge_backward_factor)
    case['pred_incremental'] = prediction

    # NOTE(review): the 'None' comparison suggests the prediction may be a
    # string; if so, `not in` is a substring test — confirm the return type.
    missed = case['root_cause'] not in prediction
    if missed and prediction != 'None':
        # Re-tune on the cases seen so far (up to and including this one);
        # optimize() hands back the clue tags and weights used from here on.
        node_clue_tags, a = optimize(case, node_clue_tags, edge_clue_tags, a, get_edge_weight, edge_backward_factor, incident_topologies[:i+1], init_clue_tag)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 55.94it/s]


best trial: FrozenTrial(number=73, state=TrialState.COMPLETE, values=[0.8333333333333334, 1.0], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 24, 224021), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 24, 242770), params={'a:AlertCount': 0.008428739408411101, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.7767187040175068}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=73, value=None)
A better solution found
a: {'AlertCount': 0.008428739408411101, 'OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.7767187040175068}


100%|██████████| 100/100 [00:02<00:00, 39.47it/s]


best trial: FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.5882352941176471, 1], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 24, 830708), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 24, 830708), params={'a:AlertCount': 1, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=0, value=None)
a: {'AlertCount': 1, 'OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0, 'JVM-Operating System_7779_JVM_JVM_CPULoad': 0}


100%|██████████| 100/100 [00:03<00:00, 30.25it/s]


best trial: FrozenTrial(number=90, state=TrialState.COMPLETE, values=[0.6111111111111112, 2.746827572678805], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 30, 263596), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 30, 300774), params={'a:AlertCount': 0.5951518262529558, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.49555306058886855, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.7443333197061959, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.9117893661307849}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': FloatDistribution(high=5.0, log=False, low=0.0, step=None)}, trial_id=90, 

100%|██████████| 100/100 [00:04<00:00, 21.92it/s]


best trial: FrozenTrial(number=46, state=TrialState.COMPLETE, values=[0.6666666666666666, 7.693714699433357], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 32, 594597), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 32, 640398), params={'a:AlertCount': 0.6364573260296996, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 3.6341649616933878, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 1.1799948682528518, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.812065432275479, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 1.431032111181939}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps

100%|██████████| 100/100 [00:06<00:00, 15.72it/s]


best trial: FrozenTrial(number=68, state=TrialState.COMPLETE, values=[0.71875, 3.985230156021407], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 39, 465543), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 39, 532586), params={'a:AlertCount': 0.547274489769302, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.382232720797675, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.600102556289185, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.7614948972779474, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 0.9563042926877249, 'a:ig_post': 0.7378211991995727}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK

100%|██████████| 100/100 [00:11<00:00,  8.74it/s]


best trial: FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.8, 3.985230156021407], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 42, 62797), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 42, 62797), params={'a:AlertCount': 0.547274489769302, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.382232720797675, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.600102556289185, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.7614948972779474, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 0.9563042926877249, 'a:ig_post': 0.7378211991995727, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatDistribution(high=5.0, log=False,

100%|██████████| 100/100 [00:15<00:00,  6.31it/s]


best trial: FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.8313253012048193, 3.985230156021407], datetime_start=datetime.datetime(2024, 12, 20, 12, 51, 53, 814072), datetime_complete=datetime.datetime(2024, 12, 20, 12, 51, 53, 814072), params={'a:AlertCount': 0.547274489769302, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.382232720797675, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.600102556289185, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.7614948972779474, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 0.9563042926877249, 'a:ig_post': 0.7378211991995727, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0, 'a:severe': 0}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': FloatD

100%|██████████| 100/100 [00:17<00:00,  5.58it/s]


best trial: FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.8314606741573034, 3.985230156021407], datetime_start=datetime.datetime(2024, 12, 20, 12, 52, 9, 973439), datetime_complete=datetime.datetime(2024, 12, 20, 12, 52, 9, 973439), params={'a:AlertCount': 0.547274489769302, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.382232720797675, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.600102556289185, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.7614948972779474, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 0.9563042926877249, 'a:ig_post': 0.7378211991995727, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0, 'a:severe': 0, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWTps': 0}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': FloatDistribution(high=5.0, log=False, low=0.0, step=None),

100%|██████████| 100/100 [00:20<00:00,  4.96it/s]


best trial: FrozenTrial(number=82, state=TrialState.COMPLETE, values=[0.8350515463917526, 14.318742164322975], datetime_start=datetime.datetime(2024, 12, 20, 12, 52, 44, 163174), datetime_complete=datetime.datetime(2024, 12, 20, 12, 52, 44, 377163), params={'a:AlertCount': 1.2387638882250087, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.6055077542892814, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 2.0232380163735817, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.3528424159508297, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 1.2951392412742593, 'a:ig_post': 4.224479929559377, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0.3061519665418823, 'a:severe': 3.9773045895561054, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWTps': 0.13916339024266688, 'a:OSLinux-OSLinux_NETWORK_NETWORK_TCP-FIN-WAIT': 0.1561509723099836}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0

100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


best trial: FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.8348623853211009, 14.318742164322975], datetime_start=datetime.datetime(2024, 12, 20, 12, 52, 48, 821824), datetime_complete=datetime.datetime(2024, 12, 20, 12, 52, 48, 821824), params={'a:AlertCount': 1.2387638882250087, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.6055077542892814, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 2.0232380163735817, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.3528424159508297, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 1.2951392412742593, 'a:ig_post': 4.224479929559377, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0.3061519665418823, 'a:severe': 3.9773045895561054, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWTps': 0.13916339024266688, 'a:OSLinux-OSLinux_NETWORK_NETWORK_TCP-FIN-WAIT': 0.1561509723099836}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:AlertCount': FloatDistribution(high=5.0, log=False, low=0.

100%|██████████| 100/100 [00:26<00:00,  3.72it/s]

best trial: FrozenTrial(number=22, state=TrialState.COMPLETE, values=[0.8267716535433071, 6.476223985491053], datetime_start=datetime.datetime(2024, 12, 20, 12, 53, 16, 534186), datetime_complete=datetime.datetime(2024, 12, 20, 12, 53, 16, 798631), params={'a:AlertCount': 0.46503665532967553, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKRead': 0.5924735168814377, 'a:JVM-Operating System_7779_JVM_JVM_CPULoad': 0.5811546871894936, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sda_DSKTps': 0.7708957831212384, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKPercentBusy': 0.321977622220258, 'a:ig_post': 0.5657085840642232, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWrite': 0.26269341867717433, 'a:severe': 0.054478300562137774, 'a:OSLinux-OSLinux_LOCALDISK_LOCALDISK-sdb_DSKWTps': 1.0106743003797187, 'a:OSLinux-OSLinux_NETWORK_NETWORK_TCP-FIN-WAIT': 0.4011953702888691, 'a:OSLinux-CPU_CPU_CPUUserTime': 1.449935746776827}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'a:Ale




In [5]:
# Flag each case whose labelled root cause appears in its incremental prediction.
# NOTE(review): if the prediction is a plain string, `in` is a substring match
# rather than an exact/membership match — confirm what root_cause_localization returns.
for case in incident_topologies:
    incremental_prediction = case['pred_incremental']
    case['right_incremental'] = case['root_cause'] in incremental_prediction


In [6]:
# Accuracy@1 of the 'incremental' predictions over the evaluation window.
test_target = 'incremental'
# Fraction of the case list skipped from the front before scoring; 0 scores every case.
begin_to_test_ratio = 0
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(begin_to_test_ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     105.000000
False     28.000000
A@1        0.789474
Name: count, dtype: float64


In [7]:
# Accuracy@1 of the 'incremental' predictions, skipping the first 30% of cases.
test_target = 'incremental'
# Fraction of the case list skipped from the front before scoring.
begin_to_test_ratio = 0.3
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(begin_to_test_ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     81.000000
False    13.000000
A@1       0.861702
Name: count, dtype: float64


In [8]:
# Accuracy@1 of the 'incremental' predictions, skipping the first 60% of cases.
test_target = 'incremental'
# Fraction of the case list skipped from the front before scoring.
begin_to_test_ratio = 0.6
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(begin_to_test_ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     43.000000
False    11.000000
A@1       0.796296
Name: count, dtype: float64


In [9]:
# Baseline run: localize every case using only the 'AlertCount' node clue and
# no edge clues, storing the result under 'pred_alertcount'.
# None is passed in the position where the incremental loop passes the weight
# dict `a`, and the two trailing arguments used there are omitted here —
# presumably the function's own defaults apply; TODO confirm against its signature.
for case in incident_topologies:
    case['pred_alertcount'] = root_cause_localization(case, ['AlertCount'], [], None)

In [10]:
# Flag each case whose labelled root cause appears in its AlertCount-only prediction.
# NOTE(review): if the prediction is a plain string, `in` is a substring match
# rather than an exact/membership match — confirm what root_cause_localization returns.
for case in incident_topologies:
    alertcount_prediction = case['pred_alertcount']
    case['right_alertcount'] = case['root_cause'] in alertcount_prediction


In [11]:
# Accuracy@1 of the 'alertcount' baseline predictions over the evaluation window.
test_target = 'alertcount'
# Fraction of the case list skipped from the front before scoring; 0 scores every case.
ratio = 0
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     103.000000
False     30.000000
A@1        0.774436
Name: count, dtype: float64


In [12]:
# Accuracy@1 of the 'alertcount' baseline predictions, skipping the first 30% of cases.
test_target = 'alertcount'
# Fraction of the case list skipped from the front before scoring.
ratio = 0.3
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     78.000000
False    16.000000
A@1       0.829787
Name: count, dtype: float64


In [13]:
# Accuracy@1 of the 'alertcount' baseline predictions, skipping the first 60% of cases.
test_target = 'alertcount'
# Fraction of the case list skipped from the front before scoring.
ratio = 0.6
summary = pd.Series([case['right_'+test_target] for case in incident_topologies[int(ratio*len(incident_topologies)):]]).value_counts()
# value_counts() omits outcomes that never occur, so indexing summary[True] /
# summary[False] directly raises KeyError on an all-correct or all-wrong window.
n_right = summary.get(True, 0)
n_wrong = summary.get(False, 0)
n_scored = n_right + n_wrong
# An empty window has no defined accuracy; report NaN instead of dividing by zero.
summary['A@1'] = n_right / n_scored if n_scored else float('nan')
print(summary)

True     40.000000
False    14.000000
A@1       0.740741
Name: count, dtype: float64


In [14]:
# Re-run localization on every case with the current node_clue_tags / a
# (the values left behind by the incremental loop), then print explain()'s
# result for the labelled root cause.
# NOTE(review): this overwrites case['pred_incremental'], so re-running the
# 'right_incremental' / A@1 cells after this one would score predictions made
# with the final weights rather than the incremental ones. Whether explain()
# needs this key (or other state set by root_cause_localization) is not
# visible here — confirm before storing the result under a different key.
# NOTE(review): the loop index i is not used in this cell.
for i, case in enumerate(incident_topologies):
    
    case['pred_incremental'] = root_cause_localization(case, node_clue_tags, edge_clue_tags, a, get_edge_weight, edge_backward_factor)

    sorted_refined_explanation_power = explain(case, 'root_cause')
    print(sorted_refined_explanation_power)

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
