In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import networkx as nx
import json
import pickle

In [2]:
# Load the calling-relationship monitoring table. The columns used by later cells are
# TimeStamp, SourceName, DestinationName, Workload and FailCount.
# NOTE(review): the displayed frame also has an 'Unnamed: 0' column, presumably an index
# saved into the CSV; none of the cells visible here reference it.
all_data=pd.read_csv("../data/calling_relationships_monitoring.csv")

In [3]:
# Display the loaded frame to inspect its columns and row range.
all_data

Unnamed: 0,TimeStamp,SourceName,DestinationName,Workload,FailCount
0,2022-03-23 15:55:00,frontend,adservice,664.000000,0.000000
1,2022-03-23 15:55:00,frontend,checkoutservice,54.666667,0.000000
2,2022-03-23 15:55:00,frontend,shippingservice,250.666667,0.000000
3,2022-03-23 15:55:00,frontend,currencyservice,3653.333333,0.000000
4,2022-03-23 15:55:00,frontend,productcatalogservice,5172.000000,0.000000
...,...,...,...,...,...
281355,2022-04-04 20:59:00,checkoutservice,productcatalogservice,104.000000,0.000000
281356,2022-04-04 20:59:00,checkoutservice,cartservice,98.666667,0.000000
281357,2022-04-04 20:59:00,recommendationservice,productcatalogservice,1001.333333,20.000000
281358,2022-04-04 20:59:00,cartservice,redis-cart,736.000000,0.000000


In [4]:
# Split the monitoring table into one frame per (SourceName, DestinationName) pair,
# each re-indexed from 0 so that row labels can be used as time steps.
ServicePairs = {}
for pair_key, pair_rows in all_data.groupby(['SourceName', 'DestinationName']):
    pair_frame = pair_rows.reset_index(drop=True)
    # FailCount from 1440 rows earlier; rows without such a predecessor get 0.
    # NOTE(review): presumably one row per minute, making 1440 rows one day — confirm.
    fail_lagged = pair_frame['FailCount'].shift(1440)
    pair_frame['YesterFailCount'] = fail_lagged.fillna(0)
    ServicePairs[pair_key] = pair_frame

# Anomaly Detection

## This is a deliberately simple anomaly detector, intended only as a showcase on this simulated dataset. In practice, a more advanced anomaly detector can be used to achieve better detection performance.

In [5]:
# Per-pair boolean series: a row is anomalous when the pair reports any failed call.
ServicePairsAnomalies = {}
for pair, pair_frame in ServicePairs.items():
    ServicePairsAnomalies[pair] = pair_frame['FailCount'] > 0
    # Disabled stricter variant: flag a row only if it and the two preceding rows are all anomalous.
    #ServicePairsAnomalies[pair] = pd.Series(np.all([ServicePairsAnomalies[pair],ServicePairsAnomalies[pair].shift(1), ServicePairsAnomalies[pair].shift(2)],axis=0))

In [6]:
# Ordered list of the (source, destination) pair keys, reused for column lookups later.
keys = list(ServicePairs)

In [7]:
# Number of rows in the first pair's frame; the issue-extraction loop uses it as the
# row bound for every pair.
# NOTE(review): assumes all pairs have the same number of rows — confirm.
datalen = len(ServicePairs[keys[0]])

In [8]:
# Place the per-pair anomaly flags side by side: one row per time step, one column per pair.
SystemAnomalies = pd.concat(list(ServicePairsAnomalies.values()), axis=1)

In [9]:
# Label each column with its (source, destination) pair key, in the order they were concatenated.
SystemAnomalies.columns = list(ServicePairsAnomalies)

In [10]:
# Count the rows where at least one pair is flagged (True) versus none (False).
SystemAnomalies.any(axis=1).value_counts()

True     11592
False     5993
dtype: int64

In [11]:
# Row-wise flag: True when any calling pair is anomalous at that row.
SystemAnomalies_any = SystemAnomalies.any(axis=1)

# Issue Extraction

In [12]:
THRESHOLD = 0.9  # anomalous edges whose |Pearson correlation| of FailCount is >= THRESHOLD are grouped
min_sample = 1   # DBSCAN min_samples; at 1 every point is a core point, so no noise label is produced
WINDOW = 10      # samples kept per edge: rows t-9 .. t inclusive
Topologies = []


def _edge_features(edge, t):
    """Return the trailing-window features of one calling edge, ending at row ``t``.

    ``edge`` is a (source, destination) key of ``ServicePairs``. The lists hold
    the WINDOW most recent values, with the value at row ``t`` last.
    """
    window = ServicePairs[edge].loc[t - WINDOW + 1:t]  # label slice, inclusive on both ends
    return {
        'src': edge[0],
        'des': edge[1],
        'FailCount': window['FailCount'].tolist(),
        'Workload': window['Workload'].tolist(),
        'YesterFailCount': window['YesterFailCount'].tolist(),
    }


def _build_topology(t, edges, nodes):
    """Assemble one topology record for row ``t`` from its edges and node list.

    ``edges`` must be non-empty. ``MaxFail`` is the largest current (row ``t``)
    FailCount over the edges, floored at 0. ``TimeStamp`` is read from the last
    edge, which is the edge the original inline code ended up using.
    """
    edges_info = [_edge_features(edge, t) for edge in edges]
    return {
        'time': t,
        'edges_info': edges_info,
        'MaxFail': max(0, *[info['FailCount'][-1] for info in edges_info]),
        'nodes': nodes,
        'TimeStamp': ServicePairs[edges[-1]].loc[t, 'TimeStamp'],
    }


def _cluster_by_correlation(pairs, t):
    """Group anomalous edges whose recent FailCount series are strongly correlated.

    Returns a list of clusters, each a list of edge keys, ordered by DBSCAN label.
    Requires at least two pairs so that ``np.corrcoef`` returns a matrix.
    """
    series = [ServicePairs[pair].loc[t - WINDOW + 1:t]['FailCount'].tolist() for pair in pairs]
    corr = np.corrcoef(series)
    corr[np.isnan(corr)] = 0   # a constant series has undefined correlation; treat it as uncorrelated
    np.fill_diagonal(corr, 1)  # every edge is fully correlated with itself
    # Binary "distance": 1 for strongly correlated edges, 2 otherwise. With eps=1.5 only
    # the strongly correlated ones count as neighbours.
    distances = np.where(np.abs(corr) >= THRESHOLD, 1.0, 2.0)
    labels = DBSCAN(eps=1.5, min_samples=min_sample, metric='precomputed').fit_predict(distances).tolist()

    clusters = [[] for _ in range(max(labels) + 1)]
    for pair, label in zip(pairs, labels):
        if label < 0:
            # Noise only appears if min_sample is raised above 1. Keep such an edge as its
            # own cluster rather than letting index -1 file it into the last cluster.
            clusters.append([pair])
        else:
            clusters[label].append(pair)
    return clusters


# NOTE(review): the upper bound skips the final row (datalen - 1); kept as in the original
# so the extracted set is unchanged — confirm whether this is intentional.
for t in range(WINDOW - 1, datalen - 1):
    if not SystemAnomalies_any.loc[t]:
        continue

    anomalous_row = SystemAnomalies.loc[t]  # fetch the row once instead of once per key
    anomalypairs = [k for k in keys if anomalous_row[k]]

    # online boutiques keep 1-edge topologies
    if len(anomalypairs) == 1:
        edge = anomalypairs[0]
        Topologies.append(_build_topology(t, [edge], [edge[0], edge[1]]))
    elif len(anomalypairs) > 1:
        for cluster in _cluster_by_correlation(anomalypairs, t):
            undirected = nx.Graph()
            directed = nx.DiGraph()
            undirected.add_edges_from(cluster)
            directed.add_edges_from(cluster)

            # Each connected group of correlated edges becomes one topology; the directed
            # graph supplies the edges so that caller -> callee orientation is kept.
            for component in nx.connected_components(undirected):
                component_edges = list(directed.subgraph(component).edges)
                Topologies.append(_build_topology(t, component_edges, list(component)))

In [13]:
# Number of topology records extracted by the loop above.
len(Topologies)

26132

In [14]:
# Persist the extracted topology records as a pickle file.
# NOTE(review): the file name is spelled 'raw_topoloies.pkl'; presumably any reader of this
# file uses the same spelling — confirm before renaming.
with open('../data/raw_topoloies.pkl', 'wb') as f:
    pickle.dump(Topologies, f)