# EDA & Anomaly Detection Test - Isolation Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import util
import json
from sklearn.ensemble import IsolationForest

matplotlib.interactive('True')
STATE = 42
SERVICE = 'ServiceK'
LAMBDA = 'LambdaA'
METRIC = 'ConcurrentExecutions' # Should be 'Duration' or 'ConcurrentExecutions'
METRIC_SLICE = None
# METRIC_SLICE = [5000, 8000] # Optional array for using only a slice of the whole metric e.g. [5000, 8000] will use df[5000:8000] later on


# Metric conversion from exported CloudWatch JSON to pandas DataFrames
metrics = util.json_to_pandas(f'../ExportedMetrics/{SERVICE}/{LAMBDA}.json')
for metric in metrics:
    print(f'{metric.upper()}:\n',metrics[metric], '\n\n')
    


DURATION:
                      Timestamps   Values
0     2020-10-17 14:15:00+00:00  160.610
1     2020-10-17 14:20:00+00:00  246.250
2     2020-10-17 14:25:00+00:00  155.695
3     2020-10-17 14:30:00+00:00  171.775
4     2020-10-17 14:35:00+00:00  194.660
...                         ...      ...
17893 2020-12-19 10:07:00+00:00  210.450
17894 2020-12-19 10:12:00+00:00  224.440
17895 2020-12-19 10:17:00+00:00  222.730
17896 2020-12-19 10:22:00+00:00  302.740
17897 2020-12-19 10:27:00+00:00  202.990

[17898 rows x 2 columns] 




### Importing microservice release dates

In [2]:
releases = util.load_releases('../ExportedMetrics/releases.json')
releases.head(10)

Unnamed: 0,ServiceNames,Timestamps
0,ServiceA,2020-10-15 13:09:02.003799915+00:00
1,ServiceHIK,2020-10-15 14:18:49.004899979+00:00
2,ServiceB,2020-10-15 14:21:56.005199909+00:00
3,ServiceB,2020-10-19 08:30:09.000400066+00:00
4,ServiceHIK,2020-10-21 13:49:47.005199909+00:00
5,ServiceB,2020-10-22 12:20:14.000600100+00:00
6,ServiceCD,2020-10-23 14:04:43.000400066+00:00
7,ServiceB,2020-10-28 09:16:09.000900030+00:00
8,ServiceB,2020-10-28 11:33:24.003599882+00:00
9,ServiceA,2020-10-29 09:56:09.004199982+00:00


In [3]:
# Some microservices are grouped together during release
release_service_map = {
    'ServiceA': 'ServiceA',
    'ServiceB': 'ServiceB',
    'ServiceC': 'ServiceCD',
    'ServiceD': 'ServiceCD',
    'ServiceE': 'ServiceEF',
    'ServiceF': 'ServiceEF',
    'ServiceH': 'ServiceHIK',
    'ServiceI': 'ServiceHIK',
    'ServiceJ': 'ServiceJ',
    'ServiceK': 'ServiceHIK'
}

service_releases = releases.loc[releases['ServiceNames'] == release_service_map[SERVICE]]

In [4]:
try:
    df = metrics[METRIC]
except:
    import winsound
    winsound.Beep(700, 200)
    winsound.Beep(500, 200)
    winsound.Beep(250, 600)
        
# df_test = df[5000:8000]
df_test = df.copy()

try:
    if (METRIC_SLICE is not None):
        df_test = df_test[METRIC_SLICE[0]:METRIC_SLICE[1]]
except NameError: # No value for METRIC_SLICE
    pass

NameError: name 'df' is not defined

### Adding feature `PostRelease` for rows right after a microservice deployment

In [None]:
print(df_test.head(1))
print(df_test.tail(1))

In [None]:
df_test = util.calculate_postrelease_feature(df_test, service_releases)

In [None]:
df_test.loc[df_test['PostRelease'] == 1]

In [None]:
plt.plot(df_test.Timestamps, df_test.Values)
plt.show()

### STL Decomposition

* Note: The trend around the start and end of the time series is extrapolated, otherwise it would be missing.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
decompose_result = seasonal_decompose(df_test.Values, period=288, extrapolate_trend='freq')

fig, axs = plt.subplots(4,1, figsize=(15,10), sharex=False)
axs[0].plot(df_test.Timestamps, df_test.Values)
axs[0].set_title('Original values')
axs[1].plot(df_test.Timestamps, decompose_result.trend)
axs[1].set_title('Trend')
axs[2].plot(df_test.Timestamps, decompose_result.seasonal)
axs[2].set_title('Seasonal')
axs[3].plot(df_test.Timestamps, decompose_result.resid)
axs[3].set_title('Residual')
plt.tight_layout()
plt.show()

In [None]:
df_test['Trend_Values'] = decompose_result.trend
df_test['Seasonal_Values'] = decompose_result.seasonal
df_test['Residual_Values'] = decompose_result.resid
df_test.head()

### Model fitting for original values and decomposed residual values

In [None]:
model = IsolationForest(
    max_features = 1.0,
    n_estimators = 50,
    max_samples = 'auto',
    contamination = 0.01,
    random_state = STATE
)
model.fit(df_test[['Values']])
df_test['Values_Scores'] = model.decision_function(df_test[['Values']])
df_test['Values_Inliers'] = model.predict(df_test[['Values']])

model.fit(df_test[['Residual_Values']])
df_test['Residual_Values_Scores'] = model.decision_function(df_test[['Residual_Values']])
df_test['Residual_Values_Inliers'] = model.predict(df_test[['Residual_Values']])

print(df_test.head())

In [None]:
values_anomaly_counts = df_test.Values_Inliers.value_counts()[-1]
residual_values_anomaly_counts = df_test.Residual_Values_Inliers.value_counts()[-1]
print(f'Anomalous data points from raw values: {values_anomaly_counts} / {len(df_test["Values_Inliers"])}')
print(f'Anomalous data points from decomposed residual values: {residual_values_anomaly_counts} / {len(df_test["Residual_Values_Inliers"])}')

In [None]:
values_anomalies = df_test.loc[df_test.Values_Inliers == -1]
residual_values_anomalies = df_test.loc[df_test.Residual_Values_Inliers == -1]
release_points = df_test.loc[df_test.PostRelease == 1]

from matplotlib.pyplot import figure
plt.figure(figsize=(30,15))
plt.plot(df_test.Timestamps, df_test.Values)
plt.plot(values_anomalies.Timestamps, values_anomalies.Values, 'o', color='green')
plt.plot(residual_values_anomalies.Timestamps, residual_values_anomalies.Values, 'x', color='orange')

for release in release_points.Timestamps:
    plt.axvline(release, color='purple')

plt.title(f'Isolation forest anomaly detection - {SERVICE}, {LAMBDA} - {METRIC}')
plt.legend(['Values', 'Anomalies from raw values', 'Anomalies from decomposed residual values', 'Release dates'])


# Output .png & .eps files
import os

savedir = f'{os.getcwd()}\\output\\{SERVICE}\\{LAMBDA}'
if not os.path.exists(savedir):
    os.makedirs(savedir)
plt.savefig(f'{savedir}\\{METRIC}.eps')
plt.savefig(f'{savedir}\\{METRIC}.png')


plt.show()

In [None]:
import winsound
winsound.Beep(200, 100)
winsound.Beep(300*2, 100)
winsound.Beep(300*2, 100)
winsound.Beep(300*2, 100)
winsound.Beep(300*2, 100)