In [1]:
from tqdm.notebook import tqdm
import multiprocessing
from typing import *
import pandas as pd
import os
import json
from pathlib import Path
from collections import defaultdict
import pickle
from concurrent.futures import ProcessPoolExecutor

DATASET = 'B'
TEST_RATE = .4
SAVE_ROOT = f'../data/{DATASET}'
os.makedirs(SAVE_ROOT, exist_ok=True)

### Run Table

In [4]:
faults = pd.read_csv(f"/SSF/data/{DATASET}/faults.csv")
faults_new = faults.drop(faults[faults['node_type']=='OS Memory;Tomcat Memory'].index)
print(faults_new.node_type.unique())
faults_new['cmdb_id'] = faults_new['root_cause_node'].apply(lambda x: x.split('_')[0])
faults_new['failure_type'] = faults_new['node_type']
faults_new = faults_new[['timestamp', 'cmdb_id', 'failure_type']]
faults = faults_new
faults

['OS CPU' 'OS Disk' 'OS Memory' 'OS Network' 'JVM CPU;OS CPU'
 'JVM CPU;JVM Memory;OS Memory' 'OS CPU;OS Network' 'OS Network;OS CPU'
 'OS CPU;JVM CPU' 'JVM CPU;JVM Memory']


Unnamed: 0,timestamp,cmdb_id,failure_type
0,1616428980,Tomcat03,OS CPU
1,1616432460,MG01,OS Disk
2,1616435220,MG01,OS Disk
3,1616438160,Tomcat01,OS Memory
4,1616448480,Tomcat02,OS Network
...,...,...,...
152,1615015140,IG02,OS Disk
154,1615027920,apache02,OS Network
155,1615034640,Tomcat04,OS CPU
156,1615043880,Tomcat01,OS Network


In [5]:
faults['instance'] = faults['cmdb_id']
faults['anomaly_type'] = faults['failure_type']
faults['st_time'] = faults['timestamp']
run_table = faults[['instance', 'st_time', 'anomaly_type']]
run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
run_table['service'] = run_table['instance']
run_table["duration"] = [600] * len(run_table)
run_table["ed_time"] = run_table["st_time"] + run_table["duration"]
run_table["data_type"] = ["train"] * len(run_table)
run_table = run_table[['st_time','service','instance','anomaly_type','duration','ed_time','data_type']]
run_table.reset_index(drop=True, inplace=True)

anomaly_cnt = run_table.groupby(by="anomaly_type")["instance"].count()
anomaly_cnt_dict = anomaly_cnt.to_dict()
for anomaly, group in run_table.groupby(by="anomaly_type"):
    sample_cnt = int(anomaly_cnt_dict[anomaly] * TEST_RATE)
    group_index = group.index.to_list()
    test_choices = group_index[-sample_cnt:]
    for choice in test_choices:
        run_table.loc[choice, "data_type"] = "test"

anomaly_types = [item for item in run_table.anomaly_type.unique()]
anomaly_types.sort()
print(' '.join(anomaly_types))

run_table

[JVM CPU;JVM Memory;OS Memory] [JVM CPU;JVM Memory] [JVM CPU;OS CPU] [OS CPU;JVM CPU] [OS CPU;OS Network] [OS CPU] [OS Disk] [OS Memory] [OS Network;OS CPU] [OS Network]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table['service'] = run_table['instance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["duration"] = [600] * len(run_table)


Unnamed: 0,st_time,service,instance,anomaly_type,duration,ed_time,data_type
0,1616428980,Tomcat03,Tomcat03,[OS CPU],600,1616429580,train
1,1616432460,MG01,MG01,[OS Disk],600,1616433060,train
2,1616435220,MG01,MG01,[OS Disk],600,1616435820,train
3,1616438160,Tomcat01,Tomcat01,[OS Memory],600,1616438760,train
4,1616448480,Tomcat02,Tomcat02,[OS Network],600,1616449080,train
...,...,...,...,...,...,...,...
152,1615015140,IG02,IG02,[OS Disk],600,1615015740,test
153,1615027920,apache02,apache02,[OS Network],600,1615028520,test
154,1615034640,Tomcat04,Tomcat04,[OS CPU],600,1615035240,test
155,1615043880,Tomcat01,Tomcat01,[OS Network],600,1615044480,test


In [6]:
run_table.to_csv(os.path.join(SAVE_ROOT, 'run_table.csv'))

### Anomalies

In [7]:
def read_metric_df(metric_path):
    __df = pd.read_csv(metric_path, index_col=0)
    __df['name'] = __df.apply(lambda _: f"{_['cmdb_id']}##{_['kpi_name']}", axis=1)
    __df.rename(columns={'kpi_name': 'metric_type'}, inplace=True)
    __df.drop(columns=['cmdb_id'], inplace=True)
    return __df

os.makedirs('tmp', exist_ok=True)
## Load performance metrics
perf_metric_out_path = 'tmp/aiops2021_b.perf_metric_df_list.pkl'
if os.path.exists(perf_metric_out_path):
    with open(perf_metric_out_path, 'rb') as f:
        perf_metric_df_list = pickle.load(f)
else:
    perf_metric_paths = list(Path("/data/aiops2021/tzs_data/output_data_b/").glob("*.csv"))
    with ProcessPoolExecutor() as pool:
        perf_metric_df_list = list(pool.map(read_metric_df, tqdm(perf_metric_paths)))
    with open(perf_metric_out_path, 'wb') as f:
        pickle.dump(perf_metric_df_list, f)


## Load kpi metrics
kpi_metric_out_path = 'tmp/aiops2021_b.kpi_df.pkl'
if os.path.exists(kpi_metric_out_path):
    kpi_df = pd.read_pickle(kpi_metric_out_path)
else:
    kpi_df = pd.read_csv('/data/aiops2021/tzs_data/system-b/kpi.csv', index_col=0)
    kpi_df = pd.melt(kpi_df, id_vars=['timestamp', 'tc'], value_vars=['rr', 'sr', 'cnt', 'mrt'], var_name='metric_type', value_name='value')
    kpi_df['name'] = kpi_df.apply(lambda x: f"{x['tc']}##{x['metric_type']}", axis=1)
    kpi_df.drop(columns=['tc'], inplace=True)
    kpi_df.to_pickle(kpi_metric_out_path)


metrics_df = pd.concat([kpi_df] + perf_metric_df_list)
metrics_df

Unnamed: 0,timestamp,metric_type,value,name
0,1616601540,rr,100.0,ServiceTest1##rr
1,1616601540,rr,100.0,ServiceTest6##rr
2,1616601540,rr,100.0,ServiceTest10##rr
3,1616601540,rr,100.0,ServiceTest9##rr
4,1616601540,rr,100.0,ServiceTest5##rr
...,...,...,...,...
1227020,1614786720,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1229322,1614786840,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1230254,1614786900,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1231332,1614787020,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...


In [8]:
metrics_df['cmdb_id'] = metrics_df['name'].apply(lambda x: x.split("##")[0])
metrics_df = metrics_df.rename(columns={"metric_type": "kpi_name"})
print(metrics_df.cmdb_id.unique())
services = ['ServiceTest1','ServiceTest6','ServiceTest10','ServiceTest9','ServiceTest5','ServiceTest11','ServiceTest2','ServiceTest7','ServiceTest3','ServiceTest8','ServiceTest4']
metrics_df_new = metrics_df[~metrics_df['cmdb_id'].isin(services)]
metrics_df = metrics_df_new
metrics_df

['ServiceTest1' 'ServiceTest6' 'ServiceTest10' 'ServiceTest9'
 'ServiceTest5' 'ServiceTest11' 'ServiceTest2' 'ServiceTest7'
 'ServiceTest3' 'ServiceTest8' 'ServiceTest4' 'Tomcat04' 'MG01' 'IG01'
 'Redis01' 'Redis02' 'apache01' 'Mysql02' 'dockerB1' 'apache02' 'dockerA2'
 'Tomcat02' 'Mysql01' 'MG02' 'Tomcat01' 'Tomcat03' 'IG02' 'dockerA1'
 'dockerB2']


Unnamed: 0,timestamp,kpi_name,value,name,cmdb_id
996,1616601600,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317445,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04
2750,1616601720,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317472,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04
4285,1616601840,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04
5578,1616601960,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04
7547,1616602080,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04
...,...,...,...,...,...
1227020,1614786720,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02
1229322,1614786840,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02
1230254,1614786900,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02
1231332,1614787020,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02


In [9]:
print("nodes: ")
nodes = list(metrics_df.cmdb_id.unique())
nodes.sort()
print(' '.join(nodes))

node_hash = {node: index for index, node in enumerate(nodes)}
src = [14,  9,  3, 17,  8, 11, 10,  8,  2, 10,  0,  1, 11,  0,  2, 15, 15, 16, 2,  7, 16, 17,  3, 12, 14, 17, 17,  8,  3, 11, 10,  9,  8, 17,  0,  1, 16,  1,  2, 15, 13, 14, 14,  4,  5, 14,  3,  8,  0,  9,  3,  1,  9,  0, 9, 10, 11, 11, 10, 16, 16,  1, 15,  6,  2, 15]
tgt = [7,  2, 16,  3,  3,  2,  6,  6,  2,  3,  8,  9, 11, 11, 14,  2,  5,  7, 17,  7, 16,  7,  3, 12,  3,  2,  5,  2, 15,  4,  2,  4,  8, 17, 10, 11, 3,  8, 16,  7, 13,  2,  5,  4,  5, 14, 14,  4,  0,  9, 17,  1,  3,  9, 6,  4,  3,  6, 10,  2,  5, 10,  3,  6, 15, 15]
print('src: ', src)
print('tgt: ', tgt)

nodes: 
IG01 IG02 MG01 MG02 Mysql01 Mysql02 Redis01 Redis02 Tomcat01 Tomcat02 Tomcat03 Tomcat04 apache01 apache02 dockerA1 dockerA2 dockerB1 dockerB2
src:  [14, 9, 3, 17, 8, 11, 10, 8, 2, 10, 0, 1, 11, 0, 2, 15, 15, 16, 2, 7, 16, 17, 3, 12, 14, 17, 17, 8, 3, 11, 10, 9, 8, 17, 0, 1, 16, 1, 2, 15, 13, 14, 14, 4, 5, 14, 3, 8, 0, 9, 3, 1, 9, 0, 9, 10, 11, 11, 10, 16, 16, 1, 15, 6, 2, 15]
tgt:  [7, 2, 16, 3, 3, 2, 6, 6, 2, 3, 8, 9, 11, 11, 14, 2, 5, 7, 17, 7, 16, 7, 3, 12, 3, 2, 5, 2, 15, 4, 2, 4, 8, 17, 10, 11, 3, 8, 16, 7, 13, 2, 5, 4, 5, 14, 14, 4, 0, 9, 17, 1, 3, 9, 6, 4, 3, 6, 10, 2, 5, 10, 3, 6, 15, 15]


In [10]:
import sys
sys.path.insert(0, '/SSF/DejaVu-Omni/DiagFusion/')
from detector.k_sigma import Ksigma

def process_task(df, case_id, st_time, ed_time):
    detector = Ksigma()
    rt = []
    scheduler = tqdm(total=len(df), desc=f"case:{case_id}, detecting")
    for instance, ins_group in df.groupby(by="cmdb_id"):
        for kpi, kpi_group in ins_group.groupby(by="kpi_name"):
            res = detector.detection(kpi_group, "value", st_time, ed_time)
            if res[0] is True:
                rt.append([int(res[1]), instance, kpi, res[2]])
        scheduler.update(len(ins_group))
    return rt

In [11]:
metric_dict = {}
tasks = []
pool = multiprocessing.Pool(processes=32)
for case_id, case in run_table.iterrows():
    # 故障前60个点，故障后0个点
    sample_interval = 60
    st_time = case["st_time"] - (sample_interval * 60)
    ed_time = case["ed_time"] + (sample_interval * 0)
    task = pool.apply_async(
        process_task,
        (
            metrics_df.query(f"timestamp >= {st_time} & timestamp < {ed_time}"),
            case_id,
            st_time,
            ed_time,
        ),
    )
    tasks.append((case_id, task))
pool.close()
pool.join()
for case_id, task in tasks:
    metric_dict[case_id] = task.get()

In [12]:
anomaly_path = os.path.join(SAVE_ROOT, 'anomalies')
os.makedirs(anomaly_path, exist_ok=True)
with open(os.path.join(anomaly_path, 'demo_metric.json'), "w") as w:
    json.dump(metric_dict, w)

In [1]:
import json
with open('/SSF/DejaVu-Omni/DiagFusion/data/A2/anomalies/demo_metric.json', 'r') as f:
    metric_dict = json.load(f)
with open('/SSF/DejaVu-Omni/DiagFusion/data/A2/anomalies/demo_metric.format.json', 'w') as f:
    json.dump(metric_dict, f, indent=4)