In [3]:
from tqdm.notebook import tqdm
import multiprocessing
from typing import *
import pandas as pd
import os
import json
from pathlib import Path
from collections import defaultdict

## 预赛

In [None]:
DATASET = 'A1'
TEST_RATE = .4
SAVE_ROOT = f'../data/{DATASET}'
os.makedirs(SAVE_ROOT, exist_ok=True)

### Run Table

In [3]:
faults = pd.read_csv(f"/SSF/data/{DATASET}/faults.csv")
print(faults.node_type.unique())
faults['instance'] = faults['name']
faults['anomaly_type'] = faults['node_type']
faults['st_time'] = faults['timestamp']
run_table = faults[['instance', 'st_time', 'anomaly_type']]
run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
run_table['service'] = run_table['instance']
run_table["duration"] = [300] * len(run_table)
run_table["ed_time"] = run_table["st_time"] + run_table["duration"]
run_table["data_type"] = ["train"] * len(run_table)
run_table = run_table[['st_time','service','instance','anomaly_type','duration','ed_time','data_type']]
run_table.reset_index(drop=True, inplace=True)

anomaly_cnt = run_table.groupby(by="anomaly_type")["instance"].count()
anomaly_cnt_dict = anomaly_cnt.to_dict()
for anomaly, group in run_table.groupby(by="anomaly_type"):
    sample_cnt = int(anomaly_cnt_dict[anomaly] * TEST_RATE)
    group_index = group.index.to_list()
    test_choices = group_index[-sample_cnt:]
    for choice in test_choices:
        run_table.loc[choice, "data_type"] = "test"

anomaly_types = [item for item in run_table.anomaly_type.unique()]
anomaly_types.sort()
print(' '.join(anomaly_types))

run_table

['Docker CPU' 'Docker' 'DB Session' 'DB State' 'OS Network']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table['service'] = run_table['instance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["duration"] = [300] * len(run_table)
A value is trying to be set 

Unnamed: 0,st_time,service,instance,anomaly_type,duration,ed_time,data_type
0,1586534700,docker_003,docker_003,[Docker CPU],300,1586535000,train
1,1586536500,docker_002,docker_002,[Docker],300,1586536800,train
2,1586538600,docker_001,docker_001,[Docker],300,1586538900,train
3,1586540400,docker_004,docker_004,[Docker CPU],300,1586540700,train
4,1586542500,db_007,db_007,[DB Session],300,1586542800,train
...,...,...,...,...,...,...,...
73,1590866220,os_017,os_017,[OS Network],300,1590866520,test
74,1590868020,db_003,db_003,[DB Session],300,1590868320,test
75,1590869820,docker_004,docker_004,[Docker CPU],300,1590870120,test
76,1590871620,docker_006,docker_006,[Docker],300,1590871920,test


In [7]:
run_table.to_csv(os.path.join(SAVE_ROOT, 'run_table.csv'))

### Anomalies

In [9]:
metrics_df = pd.concat(list(map(pd.read_csv, filter(lambda _: _.is_file(), Path('/data/aiops2020/AIOps挑战赛数据/').glob('2020_*_*/metrics_all_in_one_CFL.csv')))))
metrics_df['kpi_name'] = metrics_df['name'].map(lambda _: _.split('##')[1])
metrics_df['cmdb_id'] = metrics_df['name'].map(lambda _: _.split('##')[0])
cmdbs_used = ['db_003', 'db_007', 'db_009', 'docker_001', 'docker_002', 'docker_003', 'docker_004', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'os_017', 'os_018', 'os_019', 'os_020', 'os_021', 'os_022']
metrics_df = metrics_df[metrics_df['cmdb_id'].isin(cmdbs_used)]
metrics_df

Unnamed: 0,name,timestamp,value,kpi_name,cmdb_id
25064,db_003##ACS,1590595200,0.01,ACS,db_003
25065,db_003##ACS,1590595260,0.01,ACS,db_003
25066,db_003##ACS,1590595320,0.01,ACS,db_003
25067,db_003##ACS,1590595380,0.01,ACS,db_003
25068,db_003##ACS,1590595440,0.01,ACS,db_003
...,...,...,...,...,...
451424,os_022##succ_rate,1590875700,1.00,succ_rate,os_022
451425,os_022##succ_rate,1590875760,1.00,succ_rate,os_022
451426,os_022##succ_rate,1590875820,1.00,succ_rate,os_022
451427,os_022##succ_rate,1590875880,1.00,succ_rate,os_022


In [13]:
print("nodes: ")
nodes = list(metrics_df.cmdb_id.unique())
nodes.sort()
print(' '.join(nodes))

node_hash = {node: index for index, node in enumerate(nodes)}
src = ['os_017', 'os_018', 'os_019', 'os_020', 'os_017', 'os_018', 'os_019', 'os_020', 'db_003', 'db_003', 'db_003', 'db_003', 'docker_005', 'docker_006', 'db_007', 'db_009', 'docker_005', 'docker_006', 'db_007', 'db_009', 'docker_007', 'docker_008', 'db_007', 'db_009', 'docker_007', 'docker_008', 'db_007', 'db_009', 'docker_001', 'docker_002', 'docker_003', 'docker_004']
tgt = ['docker_001', 'docker_002', 'docker_003', 'docker_004', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'docker_003', 'docker_003', 'docker_003', 'docker_003', 'docker_004', 'docker_004', 'docker_004', 'docker_004', 'docker_001', 'docker_001', 'docker_001', 'docker_001', 'docker_002', 'docker_002', 'docker_002', 'docker_002', 'os_022', 'os_022', 'os_021', 'os_021']
src = [node_hash[item] for item in src]
tgt = [node_hash[item] for item in tgt]
print('src: ', src)
print('tgt: ', tgt)

src:  [11, 12, 13, 14, 11, 12, 13, 14, 0, 0, 0, 0, 7, 8, 1, 2, 7, 8, 1, 2, 9, 10, 1, 2, 9, 10, 1, 2, 3, 4, 5, 6]
tgt:  [3, 4, 5, 6, 7, 8, 9, 10, 7, 8, 9, 10, 5, 5, 5, 5, 6, 6, 6, 6, 3, 3, 3, 3, 4, 4, 4, 4, 16, 16, 15, 15]


In [9]:
import sys
sys.path.insert(0, '/SSF/DejaVu-Omni/DiagFusion/')
from detector.k_sigma import Ksigma

def process_task(df, case_id, st_time, ed_time):
    detector = Ksigma()
    rt = []
    scheduler = tqdm(total=len(df), desc=f"case:{case_id}, detecting")
    for instance, ins_group in df.groupby(by="cmdb_id"):
        for kpi, kpi_group in ins_group.groupby(by="kpi_name"):
            res = detector.detection(kpi_group, "value", st_time, ed_time)
            if res[0] is True:
                rt.append([int(res[1]), instance, kpi, res[2]])
        scheduler.update(len(ins_group))
    return rt

In [13]:
metric_dict = {}
tasks = []
pool = multiprocessing.Pool(processes=10)
for case_id, case in run_table.iterrows():
    # 故障前60个点，故障后0个点
    sample_interval = 60
    st_time = case["st_time"] - (sample_interval * 60)
    ed_time = case["ed_time"] + (sample_interval * 0)
    task = pool.apply_async(
        process_task,
        (
            metrics_df.query(f"timestamp >= {st_time} & timestamp < {ed_time}"),
            case_id,
            st_time,
            ed_time,
        ),
    )
    tasks.append((case_id, task))
pool.close()
pool.join()
for case_id, task in tasks:
    metric_dict[case_id] = task.get()

In [15]:
anomaly_path = os.path.join(SAVE_ROOT, 'anomalies')
os.makedirs(anomaly_path, exist_ok=True)
with open(os.path.join(anomaly_path, 'demo_metric.json'), "w") as w:
    json.dump(metric_dict, w)

## 复赛

In [4]:
DATASET = 'A2'
TEST_RATE = .4
SAVE_ROOT = f'../data/{DATASET}'
os.makedirs(SAVE_ROOT, exist_ok=True)

### Run Table

In [5]:
faults = pd.read_csv(f"/SSF/data/{DATASET}/faults.csv")
print(faults.node_type.unique())
faults['instance'] = faults['name']
faults['anomaly_type'] = faults['node_type']
faults['st_time'] = faults['timestamp']
run_table = faults[['instance', 'st_time', 'anomaly_type']]
run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
run_table['service'] = run_table['instance']
run_table["duration"] = [300] * len(run_table)
run_table["ed_time"] = run_table["st_time"] + run_table["duration"]
run_table["data_type"] = ["train"] * len(run_table)
run_table = run_table[['st_time','service','instance','anomaly_type','duration','ed_time','data_type']]
run_table.reset_index(drop=True, inplace=True)

anomaly_cnt = run_table.groupby(by="anomaly_type")["instance"].count()
anomaly_cnt_dict = anomaly_cnt.to_dict()
for anomaly, group in run_table.groupby(by="anomaly_type"):
    sample_cnt = int(anomaly_cnt_dict[anomaly] * TEST_RATE)
    group_index = group.index.to_list()
    test_choices = group_index[-sample_cnt:]
    for choice in test_choices:
        run_table.loc[choice, "data_type"] = "test"

anomaly_types = [item for item in run_table.anomaly_type.unique()]
anomaly_types.sort()
print(' '.join(anomaly_types))

run_table

['OS Network' 'Docker' 'DB State' 'DB Session' 'Docker;Docker'
 'Docker CPU' 'DB State;DB State' 'DB Session;DB Session']
[DB Session;DB Session] [DB Session] [DB State;DB State] [DB State] [Docker CPU] [Docker;Docker] [Docker] [OS Network]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table['service'] = run_table['instance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["duration"] = [300] * len(run_table)
A value is trying to be set 

Unnamed: 0,st_time,service,instance,anomaly_type,duration,ed_time,data_type
0,1590770220,os_021,os_021,[OS Network],300,1590770520,train
1,1590771480,os_018,os_018,[OS Network],300,1590771780,train
2,1590774960,docker_007,docker_007,[Docker],300,1590775260,train
3,1590777120,docker_005,docker_005,[Docker],300,1590777420,train
4,1590778380,docker_008,docker_008,[Docker],300,1590778680,train
...,...,...,...,...,...,...,...
105,1592079180,docker_008,docker_008,[Docker],300,1592079480,test
106,1592080020,docker_006,docker_006,[Docker CPU],300,1592080320,test
107,1592082180,docker_003,docker_003,[Docker CPU],300,1592082480,test
108,1592083560,docker_008,docker_008,[Docker],300,1592083860,test


In [6]:
run_table.to_csv(os.path.join(SAVE_ROOT, 'run_table.csv'))

### Anomalies

In [7]:
metrics_df = pd.concat(list(map(pd.read_csv, filter(lambda _: _.is_file(), Path('/data/aiops2020/AIOps挑战赛复赛数据/').glob('2020_*_*/metrics_all_in_one_CFL.csv')))))
metrics_df['kpi_name'] = metrics_df['name'].map(lambda _: _.split('##')[1])
metrics_df['cmdb_id'] = metrics_df['name'].map(lambda _: _.split('##')[0])
cmdbs_used = ['db_003', 'db_007', 'db_009', 'docker_001', 'docker_002', 'docker_003', 'docker_004', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'os_017', 'os_018', 'os_019', 'os_020', 'os_021', 'os_022']
metrics_df = metrics_df[metrics_df['cmdb_id'].isin(cmdbs_used)]
metrics_df

Unnamed: 0,name,timestamp,value,kpi_name,cmdb_id
34095,db_003##ACS,1592035200,0.02,ACS,db_003
34096,db_003##ACS,1592035260,0.01,ACS,db_003
34097,db_003##ACS,1592035320,0.01,ACS,db_003
34098,db_003##ACS,1592035380,0.01,ACS,db_003
34099,db_003##ACS,1592035440,0.02,ACS,db_003
...,...,...,...,...,...
448553,os_022##succ_rate,1592085300,1.00,succ_rate,os_022
448554,os_022##succ_rate,1592085360,1.00,succ_rate,os_022
448555,os_022##succ_rate,1592085420,1.00,succ_rate,os_022
448556,os_022##succ_rate,1592085480,1.00,succ_rate,os_022


In [8]:
print("nodes: ")
nodes = list(metrics_df.cmdb_id.unique())
nodes.sort()
print(' '.join(nodes))

node_hash = {node: index for index, node in enumerate(nodes)}
src = ['os_017', 'os_018', 'os_019', 'os_020', 'os_017', 'os_018', 'os_019', 'os_020', 'db_003', 'db_003', 'db_003', 'db_003', 'docker_005', 'docker_006', 'db_007', 'db_009', 'docker_005', 'docker_006', 'db_007', 'db_009', 'docker_007', 'docker_008', 'db_007', 'db_009', 'docker_007', 'docker_008', 'db_007', 'db_009', 'docker_001', 'docker_002', 'docker_003', 'docker_004']
tgt = ['docker_001', 'docker_002', 'docker_003', 'docker_004', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'docker_005', 'docker_006', 'docker_007', 'docker_008', 'docker_003', 'docker_003', 'docker_003', 'docker_003', 'docker_004', 'docker_004', 'docker_004', 'docker_004', 'docker_001', 'docker_001', 'docker_001', 'docker_001', 'docker_002', 'docker_002', 'docker_002', 'docker_002', 'os_022', 'os_022', 'os_021', 'os_021']
src = [node_hash[item] for item in src]
tgt = [node_hash[item] for item in tgt]
print('src: ', src)
print('tgt: ', tgt)

nodes: 
db_003 db_007 db_009 docker_001 docker_002 docker_003 docker_004 docker_005 docker_006 docker_007 docker_008 os_017 os_018 os_019 os_020 os_021 os_022
src:  [11, 12, 13, 14, 11, 12, 13, 14, 0, 0, 0, 0, 7, 8, 1, 2, 7, 8, 1, 2, 9, 10, 1, 2, 9, 10, 1, 2, 3, 4, 5, 6]
tgt:  [3, 4, 5, 6, 7, 8, 9, 10, 7, 8, 9, 10, 5, 5, 5, 5, 6, 6, 6, 6, 3, 3, 3, 3, 4, 4, 4, 4, 16, 16, 15, 15]


In [10]:
metric_dict = {}
tasks = []
pool = multiprocessing.Pool(processes=10)
for case_id, case in run_table.iterrows():
    # 故障前60个点，故障后0个点
    sample_interval = 60
    st_time = case["st_time"] - (sample_interval * 60)
    ed_time = case["ed_time"] + (sample_interval * 0)
    task = pool.apply_async(
        process_task,
        (
            metrics_df.query(f"timestamp >= {st_time} & timestamp < {ed_time}"),
            case_id,
            st_time,
            ed_time,
        ),
    )
    tasks.append((case_id, task))
pool.close()
pool.join()
for case_id, task in tasks:
    metric_dict[case_id] = task.get()

In [11]:
anomaly_path = os.path.join(SAVE_ROOT, 'anomalies')
os.makedirs(anomaly_path, exist_ok=True)
with open(os.path.join(anomaly_path, 'demo_metric.json'), "w") as w:
    json.dump(metric_dict, w)