In [7]:
from tqdm.notebook import tqdm
import multiprocessing
from typing import *
import pandas as pd
import os
import json
from pathlib import Path
from collections import defaultdict
import pickle
from concurrent.futures import ProcessPoolExecutor

DATASET = 'B'
TEST_RATE = .4
SAVE_ROOT = f'../data/{DATASET}'
os.makedirs(SAVE_ROOT, exist_ok=True)

### Run Table

In [3]:
def cmdb_map(row):
    cmdb_keywords = {'Tomcat', 'Redis', 'Mysql', 'IG', 'MG'}
    if any(keyword in row['cmdb_tmp'] for keyword in cmdb_keywords) and 'OS' in row['node_type']:
        return f"{row['cmdb_tmp']}_OS"
    return row['cmdb_tmp']

faults = pd.read_csv(f"/SSF/data/{DATASET}/faults.csv")
faults_new = faults.drop(faults[faults['node_type']=='OS Memory;Tomcat Memory'].index)
print(faults_new.node_type.unique())
faults_new['cmdb_tmp'] = faults_new['root_cause_node'].apply(lambda x: x.split('_')[0])
faults_new['cmdb_id'] = faults_new.apply(cmdb_map, axis=1)
faults_new['failure_type'] = faults_new['node_type']
faults_new = faults_new[['timestamp', 'cmdb_id', 'failure_type']]
faults_new.loc[68, 'cmdb_id'] = 'MG02;MG02_OS'
faults_new.loc[72, 'cmdb_id'] = 'MG02;MG02_OS'
faults_new.loc[82, 'cmdb_id'] = 'MG02;MG02_OS'
faults_new.loc[86, 'cmdb_id'] = 'MG02;MG02_OS'
faults_new.loc[90, 'cmdb_id'] = 'IG02;IG02_OS'
faults_new.loc[92, 'cmdb_id'] = 'IG01;IG01_OS'
faults = faults_new
faults

['OS CPU' 'OS Disk' 'OS Memory' 'OS Network' 'JVM CPU;OS CPU'
 'JVM CPU;JVM Memory;OS Memory' 'OS CPU;OS Network' 'OS Network;OS CPU'
 'OS CPU;JVM CPU' 'JVM CPU;JVM Memory']


Unnamed: 0,timestamp,cmdb_id,failure_type
0,1616428980,Tomcat03_OS,OS CPU
1,1616432460,MG01_OS,OS Disk
2,1616435220,MG01_OS,OS Disk
3,1616438160,Tomcat01_OS,OS Memory
4,1616448480,Tomcat02_OS,OS Network
...,...,...,...
152,1615015140,IG02_OS,OS Disk
154,1615027920,apache02,OS Network
155,1615034640,Tomcat04_OS,OS CPU
156,1615043880,Tomcat01_OS,OS Network


In [5]:
faults['instance'] = faults['cmdb_id']
faults['anomaly_type'] = faults['failure_type']
faults['st_time'] = faults['timestamp']
run_table = faults[['instance', 'st_time', 'anomaly_type']]
run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
run_table['service'] = run_table['instance']
run_table["duration"] = [600] * len(run_table)
run_table["ed_time"] = run_table["st_time"] + run_table["duration"]
run_table["data_type"] = ["train"] * len(run_table)
run_table = run_table[['st_time','service','instance','anomaly_type','duration','ed_time','data_type']]
run_table.reset_index(drop=True, inplace=True)

anomaly_cnt = run_table.groupby(by="anomaly_type")["instance"].count()
anomaly_cnt_dict = anomaly_cnt.to_dict()
for anomaly, group in run_table.groupby(by="anomaly_type"):
    sample_cnt = int(anomaly_cnt_dict[anomaly] * TEST_RATE)
    group_index = group.index.to_list()
    test_choices = group_index[-sample_cnt:]
    for choice in test_choices:
        run_table.loc[choice, "data_type"] = "test"

anomaly_types = [item for item in run_table.anomaly_type.unique()]
anomaly_types.sort()
print(' '.join(anomaly_types))

run_table

[JVM CPU;JVM Memory;OS Memory] [JVM CPU;JVM Memory] [JVM CPU;OS CPU] [OS CPU;JVM CPU] [OS CPU;OS Network] [OS CPU] [OS Disk] [OS Memory] [OS Network;OS CPU] [OS Network]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table['service'] = run_table['instance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["duration"] = [600] * len(run_table)


Unnamed: 0,st_time,service,instance,anomaly_type,duration,ed_time,data_type
0,1616428980,Tomcat03_OS,Tomcat03_OS,[OS CPU],600,1616429580,train
1,1616432460,MG01_OS,MG01_OS,[OS Disk],600,1616433060,train
2,1616435220,MG01_OS,MG01_OS,[OS Disk],600,1616435820,train
3,1616438160,Tomcat01_OS,Tomcat01_OS,[OS Memory],600,1616438760,train
4,1616448480,Tomcat02_OS,Tomcat02_OS,[OS Network],600,1616449080,train
...,...,...,...,...,...,...,...
152,1615015140,IG02_OS,IG02_OS,[OS Disk],600,1615015740,test
153,1615027920,apache02,apache02,[OS Network],600,1615028520,test
154,1615034640,Tomcat04_OS,Tomcat04_OS,[OS CPU],600,1615035240,test
155,1615043880,Tomcat01_OS,Tomcat01_OS,[OS Network],600,1615044480,test


In [6]:
run_table.to_csv(os.path.join(SAVE_ROOT, 'run_table.csv'))

### Anomalies

In [10]:
def read_metric_df(metric_path):
    __df = pd.read_csv(metric_path, index_col=0)
    __df['name'] = __df.apply(lambda _: f"{_['cmdb_id']}##{_['kpi_name']}", axis=1)
    __df.rename(columns={'kpi_name': 'metric_type'}, inplace=True)
    __df.drop(columns=['cmdb_id'], inplace=True)
    return __df

os.makedirs('tmp', exist_ok=True)
## Load performance metrics
perf_metric_out_path = 'tmp/aiops2021_b.perf_metric_df_list.pkl'
if os.path.exists(perf_metric_out_path):
    with open(perf_metric_out_path, 'rb') as f:
        perf_metric_df_list = pickle.load(f)
else:
    perf_metric_paths = list(Path("/data/aiops2021/tzs_data/output_data_b/").glob("*.csv"))
    with ProcessPoolExecutor() as pool:
        perf_metric_df_list = list(pool.map(read_metric_df, tqdm(perf_metric_paths)))
    with open(perf_metric_out_path, 'wb') as f:
        pickle.dump(perf_metric_df_list, f)


## Load kpi metrics
kpi_metric_out_path = 'tmp/aiops2021_b.kpi_df.pkl'
if os.path.exists(kpi_metric_out_path):
    kpi_df = pd.read_pickle(kpi_metric_out_path)
else:
    kpi_df = pd.read_csv('/data/aiops2021/tzs_data/system-b/kpi.csv', index_col=0)
    kpi_df = pd.melt(kpi_df, id_vars=['timestamp', 'tc'], value_vars=['rr', 'sr', 'cnt', 'mrt'], var_name='metric_type', value_name='value')
    kpi_df['name'] = kpi_df.apply(lambda x: f"{x['tc']}##{x['metric_type']}", axis=1)
    kpi_df.drop(columns=['tc'], inplace=True)
    kpi_df.to_pickle(kpi_metric_out_path)


metrics_df = pd.concat([kpi_df] + perf_metric_df_list)
metrics_df

Unnamed: 0,timestamp,metric_type,value,name
0,1616601540,rr,100.0,ServiceTest1##rr
1,1616601540,rr,100.0,ServiceTest6##rr
2,1616601540,rr,100.0,ServiceTest10##rr
3,1616601540,rr,100.0,ServiceTest9##rr
4,1616601540,rr,100.0,ServiceTest5##rr
...,...,...,...,...
1227020,1614786720,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1229322,1614786840,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1230254,1614786900,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...
1231332,1614787020,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.0,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...


In [15]:
def node_map(row):
    cmdb_keywords = {'Tomcat', 'Redis', 'Mysql', 'IG', 'MG'}
    if any(keyword in row['cmdb_id'] for keyword in cmdb_keywords) and 'OSLinux' in row['kpi_name']:
        return f"{row['cmdb_id']}_OS"
    return row['cmdb_id']

metrics_df['cmdb_id'] = metrics_df['name'].apply(lambda x: x.split("##")[0])
metrics_df = metrics_df.rename(columns={"metric_type": "kpi_name"})
print(metrics_df.cmdb_id.unique())
services = ['ServiceTest1','ServiceTest6','ServiceTest10','ServiceTest9','ServiceTest5','ServiceTest11','ServiceTest2','ServiceTest7','ServiceTest3','ServiceTest8','ServiceTest4']
metrics_df_new = metrics_df[~metrics_df['cmdb_id'].isin(services)]
metrics_df_new['node'] = metrics_df_new.apply(node_map, axis=1)
metrics_df = metrics_df_new
metrics_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics_df_new['node'] = metrics_df_new.apply(node_map, axis=1)


Unnamed: 0,timestamp,kpi_name,value,name,cmdb_id,node
996,1616601600,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317445,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04,Tomcat04_OS
2750,1616601720,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317472,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04,Tomcat04_OS
4285,1616601840,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04,Tomcat04_OS
5578,1616601960,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04,Tomcat04_OS
7547,1616602080,OSLinux-OSLinux_FILESYSTEM_----_FSCapacity,30.317499,Tomcat04##OSLinux-OSLinux_FILESYSTEM_----_FSCa...,Tomcat04,Tomcat04_OS
...,...,...,...,...,...,...
1227020,1614786720,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02,Mysql02_OS
1229322,1614786840,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,98.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02,Mysql02_OS
1230254,1614786900,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02,Mysql02_OS
1231332,1614787020,OSLinux-OSLinux_MEMORY_MEMORY_MEMUsedMemPerc,97.000000,Mysql02##OSLinux-OSLinux_MEMORY_MEMORY_MEMUsed...,Mysql02,Mysql02_OS


In [21]:
print("nodes: ")
nodes = list(metrics_df.node.unique())
nodes.sort()
print(' '.join(nodes))

node_hash = {node: index for index, node in enumerate(nodes)}
src = [26, 26, 18, 18, 19, 19, 6, 7, 29, 29, 16, 16, 17, 17, 22, 22, 23, 23, 20, 20, 21, 21, 16, 16, 17, 17, 4, 4, 5, 5, 20, 20, 21, 21, 0, 0, 1, 1, 2, 2, 3, 3, 22, 22, 23, 23, 0, 0, 1, 1, 4, 5, 27, 27, 27, 27, 28, 28, 4, 5, 14, 14, 15, 15, 28, 29, 29, 6, 6, 7, 7, 24, 26, 26, 29, 29, 29, 29, 16, 16, 17, 17, 6, 7, 22, 22, 23, 23, 20, 20, 21, 21, 18, 18, 19, 19, 16, 16, 17, 17, 29, 0, 0, 1, 1, 2, 2, 3, 3, 28, 28, 2, 2, 3, 3, 4, 5, 27, 27, 25, 26, 26, 26, 26, 8, 8, 9, 9, 10, 10, 11, 11, 26, 6, 7, 16, 16, 17, 17, 0, 0, 1, 1, 18, 18, 19, 19, 6, 7, 2, 2, 3, 3, 18, 18, 19, 19, 0, 0, 1, 1, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 22, 22, 23, 23, 20, 20, 21, 21, 28, 28, 28, 28, 2, 2, 3, 3, 27, 27, 12, 12, 13, 13, 4, 5, 27]
tgt = [14, 15, 4, 5, 4, 5, 28, 28, 6, 7, 6, 7, 6, 7, 4, 5, 4, 5, 12, 13, 12, 13, 12, 13, 12, 13, 4, 5, 4, 5, 6, 7, 6, 7, 16, 17, 16, 17, 18, 19, 18, 19, 22, 23, 22, 23, 22, 23, 22, 23, 26, 26, 4, 5, 10, 11, 14, 15, 29, 29, 14, 15, 14, 15, 28, 14, 15, 6, 7, 6, 7, 24, 6, 7, 4, 5, 10, 11, 4, 5, 4, 5, 27, 27, 8, 9, 8, 9, 4, 5, 4, 5, 8, 9, 8, 9, 16, 17, 16, 17, 29, 20, 21, 20, 21, 22, 23, 22, 23, 6, 7, 16, 17, 16, 17, 28, 28, 14, 15, 25, 4, 5, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11, 26, 26, 26, 8, 9, 8, 9, 0, 1, 0, 1, 18, 19, 18, 19, 29, 29, 2, 3, 2, 3, 6, 7, 6, 7, 18, 19, 18, 19, 12, 13, 12, 13, 8, 9, 8, 9, 6, 7, 6, 7, 12, 13, 12, 13, 20, 21, 20, 21, 4, 5, 10, 11, 20, 21, 20, 21, 6, 7, 12, 13, 12, 13, 27, 27, 27]
print('src: ', src)
print('tgt: ', tgt)

nodes: 
IG01 IG01_OS IG02 IG02_OS MG01 MG01_OS MG02 MG02_OS Mysql01 Mysql01_OS Mysql02 Mysql02_OS Redis01 Redis01_OS Redis02 Redis02_OS Tomcat01 Tomcat01_OS Tomcat02 Tomcat02_OS Tomcat03 Tomcat03_OS Tomcat04 Tomcat04_OS apache01 apache02 dockerA1 dockerA2 dockerB1 dockerB2
src:  [26, 26, 18, 18, 19, 19, 6, 7, 29, 29, 16, 16, 17, 17, 22, 22, 23, 23, 20, 20, 21, 21, 16, 16, 17, 17, 4, 4, 5, 5, 20, 20, 21, 21, 0, 0, 1, 1, 2, 2, 3, 3, 22, 22, 23, 23, 0, 0, 1, 1, 4, 5, 27, 27, 27, 27, 28, 28, 4, 5, 14, 14, 15, 15, 28, 29, 29, 6, 6, 7, 7, 24, 26, 26, 29, 29, 29, 29, 16, 16, 17, 17, 6, 7, 22, 22, 23, 23, 20, 20, 21, 21, 18, 18, 19, 19, 16, 16, 17, 17, 29, 0, 0, 1, 1, 2, 2, 3, 3, 28, 28, 2, 2, 3, 3, 4, 5, 27, 27, 25, 26, 26, 26, 26, 8, 8, 9, 9, 10, 10, 11, 11, 26, 6, 7, 16, 16, 17, 17, 0, 0, 1, 1, 18, 18, 19, 19, 6, 7, 2, 2, 3, 3, 18, 18, 19, 19, 0, 0, 1, 1, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 22, 22, 23, 23, 20, 20, 21, 21, 28, 28, 28, 28, 2, 2, 3, 3, 27, 27, 12, 12, 13, 13, 4, 5,

In [22]:
import sys
sys.path.insert(0, '/SSF/DejaVu-Omni/DiagFusion/')
from detector.k_sigma import Ksigma

def process_task(df, case_id, st_time, ed_time):
    detector = Ksigma()
    rt = []
    scheduler = tqdm(total=len(df), desc=f"case:{case_id}, detecting")
    for instance, ins_group in df.groupby(by="node"):
        for kpi, kpi_group in ins_group.groupby(by="kpi_name"):
            res = detector.detection(kpi_group, "value", st_time, ed_time)
            if res[0] is True:
                rt.append([int(res[1]), instance, kpi, res[2]])
        scheduler.update(len(ins_group))
    return rt

In [23]:
metric_dict = {}
tasks = []
pool = multiprocessing.Pool(processes=10)
for case_id, case in run_table.iterrows():
    # 故障前60个点，故障后0个点
    sample_interval = 60
    st_time = case["st_time"] - (sample_interval * 60)
    ed_time = case["ed_time"] + (sample_interval * 0)
    task = pool.apply_async(
        process_task,
        (
            metrics_df.query(f"timestamp >= {st_time} & timestamp < {ed_time}"),
            case_id,
            st_time,
            ed_time,
        ),
    )
    tasks.append((case_id, task))
pool.close()
pool.join()
for case_id, task in tasks:
    metric_dict[case_id] = task.get()

In [24]:
anomaly_path = os.path.join(SAVE_ROOT, 'anomalies')
os.makedirs(anomaly_path, exist_ok=True)
with open(os.path.join(anomaly_path, 'demo_metric.json'), "w") as w:
    json.dump(metric_dict, w)

: 