In [1]:
from tqdm.notebook import tqdm
import multiprocessing
from typing import *
import pandas as pd
import os
import json
from pathlib import Path
from collections import defaultdict
import pickle
from concurrent.futures import ProcessPoolExecutor

DATASET = 'C'
TEST_RATE = .4
SAVE_ROOT = f'../data/{DATASET}'
os.makedirs(SAVE_ROOT, exist_ok=True)

### Run Table

In [6]:
faults = pd.read_csv(f"/SSF/data/{DATASET}/faults.csv")
print(faults.node_type.unique())
faults['instance'] = faults['root_cause_node'].apply(lambda x: x.split()[0])
faults['anomaly_type'] = faults['node_type']
faults['st_time'] = faults['timestamp']
faults

['Pod' 'Node CPU' 'Node Memory' 'Pod CPU' 'Pod Memory']


Unnamed: 0,timestamp,duration,root_cause_node,experiment_type,failure_class,node_type,instance,anomaly_type,st_time
0,1645599600,5m,ts-basic-service-1,pod-network-corrupt,pod-network-corrupt,Pod,ts-basic-service-1,Pod,1645599600
1,1645603200,5m,ts-preserve-service-1,pod-network-corrupt,pod-network-corrupt,Pod,ts-preserve-service-1,Pod,1645603200
2,1645606800,5m,node3 CPU,node-cpu-stress-lzy-k8s-3,node-cpu-stress,Node CPU,node3,Node CPU,1645606800
3,1645614000,5m,node4 Memory,node-memory-stress-lzy-k8s-4,node-memory-stress,Node Memory,node4,Node Memory,1645614000
4,1645617600,5m,node3 Memory,node-memory-stress-lzy-k8s-3,node-memory-stress,Node Memory,node3,Node Memory,1645617600
...,...,...,...,...,...,...,...,...,...
164,1646470800,5m,node4 CPU,node-cpu-stress-lzy-k8s-4,node-cpu-stress,Node CPU,node4,Node CPU,1646470800
165,1646474400,5m,node3 CPU,node-cpu-stress-lzy-k8s-3,node-cpu-stress,Node CPU,node3,Node CPU,1646474400
166,1646492400,5m,ts-user-service-0 Memory,pod-memory-stress,pod-memory-stress,Pod Memory,ts-user-service-0,Pod Memory,1646492400
167,1646506800,5m,node3 Memory,node-memory-stress-lzy-k8s-3,node-memory-stress,Node Memory,node3,Node Memory,1646506800


In [7]:
run_table = faults[['instance', 'st_time', 'anomaly_type']]
run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
run_table['service'] = run_table['instance']
run_table["duration"] = [300] * len(run_table)
run_table["ed_time"] = run_table["st_time"] + run_table["duration"]
run_table["data_type"] = ["train"] * len(run_table)
run_table = run_table[['st_time','service','instance','anomaly_type','duration','ed_time','data_type']]
run_table.reset_index(drop=True, inplace=True)

anomaly_cnt = run_table.groupby(by="anomaly_type")["instance"].count()
anomaly_cnt_dict = anomaly_cnt.to_dict()
for anomaly, group in run_table.groupby(by="anomaly_type"):
    sample_cnt = int(anomaly_cnt_dict[anomaly] * TEST_RATE)
    group_index = group.index.to_list()
    test_choices = group_index[-sample_cnt:]
    for choice in test_choices:
        run_table.loc[choice, "data_type"] = "test"

anomaly_types = [item for item in run_table.anomaly_type.unique()]
anomaly_types.sort()
print(' '.join(anomaly_types))

run_table

[Node CPU] [Node Memory] [Pod CPU] [Pod Memory] [Pod]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["anomaly_type"] = run_table["anomaly_type"].apply(lambda x: "[" + x + "]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table['service'] = run_table['instance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run_table["duration"] = [300] * len(run_table)
A value is trying to be set 

Unnamed: 0,st_time,service,instance,anomaly_type,duration,ed_time,data_type
0,1645599600,ts-basic-service-1,ts-basic-service-1,[Pod],300,1645599900,train
1,1645603200,ts-preserve-service-1,ts-preserve-service-1,[Pod],300,1645603500,train
2,1645606800,node3,node3,[Node CPU],300,1645607100,train
3,1645614000,node4,node4,[Node Memory],300,1645614300,train
4,1645617600,node3,node3,[Node Memory],300,1645617900,train
...,...,...,...,...,...,...,...
164,1646470800,node4,node4,[Node CPU],300,1646471100,test
165,1646474400,node3,node3,[Node CPU],300,1646474700,test
166,1646492400,ts-user-service-0,ts-user-service-0,[Pod Memory],300,1646492700,test
167,1646506800,node3,node3,[Node Memory],300,1646507100,test


In [8]:
run_table.to_csv(os.path.join(SAVE_ROOT, 'run_table.csv'))

### Anomalies

In [2]:
def load_pkl(filepath):
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    return data

metrics_df = load_pkl('/SSF/data/C/metrics.pkl')
metrics_df

Unnamed: 0,serviceName,timestamp,value,metric_kind,name,pod,metric_type
0,ts-admin-basic-info-service,1645596000,4.0,count,ts-admin-basic-info-service##count,,count
1,ts-admin-basic-info-service,1645596060,4.0,count,ts-admin-basic-info-service##count,,count
2,ts-admin-basic-info-service,1645596120,4.0,count,ts-admin-basic-info-service##count,,count
3,ts-admin-basic-info-service,1645596180,4.0,count,ts-admin-basic-info-service##count,,count
4,ts-admin-basic-info-service,1645596240,4.0,count,ts-admin-basic-info-service##count,,count
...,...,...,...,...,...,...,...
62974086,,1646510760,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded
62974087,,1646510820,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded
62974088,,1646510880,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded
62974089,,1646510940,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded


In [9]:
metrics_df['cmdb'] = metrics_df['name'].map(lambda _: _.split('##')[0])
cmdb_excluded = ['jaeger-query', 'unknown']
metrics_df = metrics_df[~metrics_df['cmdb'].isin(cmdb_excluded)]
metrics_df = metrics_df.rename(columns={"metric_type": "kpi_name", "cmdb": "cmdb_id"})
metrics_df

Unnamed: 0,serviceName,timestamp,value,metric_kind,name,pod,kpi_name,cmdb_id
0,ts-admin-basic-info-service,1645596000,4.0,count,ts-admin-basic-info-service##count,,count,ts-admin-basic-info-service
1,ts-admin-basic-info-service,1645596060,4.0,count,ts-admin-basic-info-service##count,,count,ts-admin-basic-info-service
2,ts-admin-basic-info-service,1645596120,4.0,count,ts-admin-basic-info-service##count,,count,ts-admin-basic-info-service
3,ts-admin-basic-info-service,1645596180,4.0,count,ts-admin-basic-info-service##count,,count,ts-admin-basic-info-service
4,ts-admin-basic-info-service,1645596240,4.0,count,ts-admin-basic-info-service##count,,count,ts-admin-basic-info-service
...,...,...,...,...,...,...,...,...
62974086,,1646510760,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded,ts-verification-code-service-1
62974087,,1646510820,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded,ts-verification-code-service-1
62974088,,1646510880,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded,ts-verification-code-service-1
62974089,,1646510940,7872.0,,ts-verification-code-service-1##jvm_class_loaded,,jvm_class_loaded,ts-verification-code-service-1


In [11]:
nodes = list(metrics_df.cmdb_id.unique())
nodes.sort()
print(f"{len(nodes)} nodes: \n{' '.join(nodes)}")
node_hash = {node: index for index, node in enumerate(nodes)}

import sys
sys.path.insert(0, '/SSF/DejaVu-Omni')
from failure_dependency_graph.parse_yaml_graph_config import parse_yaml_graph_config

graph = parse_yaml_graph_config('/SSF/data/C/graphs/graph_1645779600.yml', Path('tmp'))
src, tgt = [], []
for i, j in graph.edges:
    if i in nodes and j in nodes:
        src.append(j)
        tgt.append(i)
src = [node_hash[item] for item in src]
# tgt = [node_hash[item] for item in tgt]
print('src: ', src)
print('tgt: ', tgt)

171 nodes: 
node1 node2 node3 node4 ts-admin-basic-info-service ts-admin-basic-info-service-0 ts-admin-order-service ts-admin-order-service-0 ts-admin-route-service ts-admin-route-service-0 ts-admin-travel-service ts-admin-travel-service-0 ts-admin-user-service ts-admin-user-service-0 ts-assurance-mongo ts-assurance-mongo-0 ts-assurance-service ts-assurance-service-0 ts-auth-mongo ts-auth-mongo-0 ts-auth-service ts-auth-service-0 ts-auth-service-1 ts-avatar-service-0 ts-avatar-service-1 ts-basic-service ts-basic-service-0 ts-basic-service-1 ts-basic-service-2 ts-cancel-service ts-cancel-service-0 ts-cancel-service-1 ts-config-mongo ts-config-mongo-0 ts-config-service ts-config-service-0 ts-config-service-1 ts-consign-mongo ts-consign-mongo-0 ts-consign-price-mongo ts-consign-price-mongo-0 ts-consign-price-service ts-consign-price-service-0 ts-consign-price-service-1 ts-consign-service ts-consign-service-0 ts-consign-service-1 ts-contacts-mongo ts-contacts-mongo-0 ts-contacts-service ts

In [12]:
import sys
sys.path.insert(0, '/SSF/DejaVu-Omni/DiagFusion/')
from detector.k_sigma import Ksigma

def process_task(df, case_id, st_time, ed_time):
    detector = Ksigma()
    rt = []
    scheduler = tqdm(total=len(df), desc=f"case:{case_id}, detecting")
    for instance, ins_group in df.groupby(by="cmdb_id"):
        for kpi, kpi_group in ins_group.groupby(by="kpi_name"):
            res = detector.detection(kpi_group, "value", st_time, ed_time)
            if res[0] is True:
                rt.append([int(res[1]), instance, kpi, res[2]])
        scheduler.update(len(ins_group))
    return rt

In [13]:
metric_dict = {}
tasks = []
pool = multiprocessing.Pool(processes=10)
for case_id, case in run_table.iterrows():
    # 故障前60个点，故障后0个点
    sample_interval = 60
    st_time = case["st_time"] - (sample_interval * 60)
    ed_time = case["ed_time"] + (sample_interval * 0)
    task = pool.apply_async(
        process_task,
        (
            metrics_df.query(f"timestamp >= {st_time} & timestamp < {ed_time}"),
            case_id,
            st_time,
            ed_time,
        ),
    )
    tasks.append((case_id, task))
pool.close()
pool.join()
for case_id, task in tasks:
    metric_dict[case_id] = task.get()

In [14]:
anomaly_path = os.path.join(SAVE_ROOT, 'anomalies')
os.makedirs(anomaly_path, exist_ok=True)
with open(os.path.join(anomaly_path, 'demo_metric.json'), "w") as w:
    json.dump(metric_dict, w)