In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import json

In [2]:
# Load the topology records that the rest of the notebook labels in place.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this against a file from a trusted source.
with open('../data/raw_topoloies.pkl', 'rb') as f:
    Topologies = pickle.load(f)

In [3]:
# Sanity check: number of records loaded into Topologies.
len(Topologies)

26132

# LABEL RULE 1

## Process injected faults

In [4]:
# Injected-fault records. The labeling loop that consumes this frame reads the
# 'time', 'inject_type' and 'inject_serive' columns.
inject_df = pd.read_csv('../data/injected_faults.csv')

In [5]:
# Initialise the label 'y' of every topology to 0 before any labeling rule runs.
# The rules that follow overwrite it with -1 or 1 where applicable.
for topo in Topologies:
    topo['y'] = 0

In [6]:
# Label the topologies that fall inside each injected fault's time window.
#
# For every row of inject_df:
#   * all topologies with inject time <= topo['time'] < inject time + INJECT_WINDOW
#     get y = -1;
#   * among them, the first one with the strictly largest positive 'MaxFail'
#     (and, unless the fault is 'excessive flow', containing the injected
#     service in its 'nodes') gets y = 1 plus 'root_cause' / 'root_cause_type'.
#
# NOTE(review): y = -1 is written unconditionally, so if two injection windows
# overlap, a topology labeled 1 by an earlier fault is reset to -1 (while
# keeping its old 'root_cause'). Confirm windows never overlap.

# Window length, in the same units as topo['time'] / inject_df['time'].
# NOTE(review): unit not visible here (seconds? minutes?) - confirm.
INJECT_WINDOW = 15

for index, inject_item in inject_df.iterrows():
    window_start = inject_item['time']
    window_end = window_start + INJECT_WINDOW
    corressponding_topo_i_s = [
        i for i, topo in enumerate(Topologies)
        if topo['time'] >= window_start and topo['time'] < window_end
    ]

    # 'excessive flow' faults are attributed to 'All' rather than to a single
    # service, so no service-membership check applies to them.
    is_excessive_flow = inject_item['inject_type'] == 'excessive flow'
    root_cause = 'All' if is_excessive_flow else inject_item['inject_serive']

    MaxFail, MaxFail_ci = 0, None
    for ci in corressponding_topo_i_s:
        candidate = Topologies[ci]
        if candidate['MaxFail'] > MaxFail and (is_excessive_flow or root_cause in candidate['nodes']):
            MaxFail_ci = ci
            MaxFail = candidate['MaxFail']
        candidate['y'] = -1

    # No topology qualified (empty window, MaxFail never > 0, or service absent):
    # leave the window at -1 without a positive label.
    if MaxFail_ci is not None:
        Topologies[MaxFail_ci]['y'] = 1
        Topologies[MaxFail_ci]['root_cause'] = root_cause
        Topologies[MaxFail_ci]['root_cause_type'] = inject_item['inject_type']

## Process platform faults

In [7]:
# Platform-fault records. The labeling loop that consumes this frame reads the
# 'BeginTimeStamp', 'EndTimeStamp' and 'service' columns.
platform_fault_df = pd.read_csv('../data/platform_faults.csv')

In [8]:
# Label the topologies that fall inside each platform fault's time range.
#
# For every row of platform_fault_df:
#   * all topologies with BeginTimeStamp <= topo['TimeStamp'] <= EndTimeStamp
#     that are not already labeled 1 get y = -1;
#   * among them, the first one with the strictly largest positive 'MaxFail'
#     that contains the faulty service in its 'nodes' and is not already
#     labeled 1 gets y = 1 with root_cause_type 'platform_fault'.
# The chosen index (or None when nothing qualified) is printed per fault.

# Parse each topology timestamp once up front instead of twice per
# (fault, topology) pair inside the loop.
topo_timestamps = [pd.to_datetime(topo['TimeStamp']) for topo in Topologies]

for index, platform_fault in platform_fault_df.iterrows():
    fault_begin = pd.to_datetime(platform_fault['BeginTimeStamp'])
    fault_end = pd.to_datetime(platform_fault['EndTimeStamp'])
    corressponding_topo_i_s = [
        i for i, topo_ts in enumerate(topo_timestamps)
        if topo_ts >= fault_begin and topo_ts <= fault_end
    ]

    MaxFail, MaxFail_ci = 0, None
    for ci in corressponding_topo_i_s:
        candidate = Topologies[ci]
        if candidate['MaxFail'] > MaxFail and platform_fault['service'] in candidate['nodes'] and candidate['y'] != 1:
            MaxFail_ci = ci
            MaxFail = candidate['MaxFail']
        # Never downgrade a topology that an earlier rule already labeled 1.
        if candidate['y'] != 1:
            candidate['y'] = -1

    # Guard against faults with no qualifying topology: indexing with None
    # would raise TypeError. Mirrors the guard used for injected faults.
    if MaxFail_ci is not None:
        Topologies[MaxFail_ci]['y'] = 1
        Topologies[MaxFail_ci]['root_cause'] = platform_fault['service']
        Topologies[MaxFail_ci]['root_cause_type'] = 'platform_fault'
    print(MaxFail_ci)

21304
22570


# LABEL RULE 2:

## Faults in `adservice` and `emailservice` do not affect key functionality because a degradation policy is implemented for them, so they are not labeled as incidents.

In [9]:
for topo in Topologies:
    if topo['y'] == 1 and (topo['root_cause'] == 'adservice' or topo['root_cause'] == 'emailservice'):
        topo['y'] = 0

In [10]:
# Count how many topologies carry each label value after rule 2.
labels_after_rule_2 = [topo['y'] for topo in Topologies]
pd.Series(labels_after_rule_2).value_counts()

-1    20012
 0     5417
 1      703
dtype: int64

# LABEL RULE 3:

## Some performance issues have little effect when the system is under a low workload (e.g., at 00:00 - 06:00), so they are not considered incidents.

In [11]:
for topo in Topologies:
    if topo['y'] == 1 and (topo['root_cause_type'] == 'cpu' or topo['root_cause_type'] == 'latency') and topo['MaxFail']<50:
        topo['y'] = 0

In [12]:
# Count how many topologies carry each label value after rule 3.
labels_after_rule_3 = [topo['y'] for topo in Topologies]
pd.Series(labels_after_rule_3).value_counts()

-1    20012
 0     5494
 1      626
dtype: int64

In [13]:
# Persist the labeled topologies (with 'y' and, where set, 'root_cause' /
# 'root_cause_type') for downstream use.
with open('../data/issue_topoloies.pkl', 'wb') as f:
    pickle.dump(Topologies, f)