In [1]:
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from subprocess import check_output
import psutil
import platform
import os
import pandas as pd
import time
import scipy
from scipy import stats
import sys
%matplotlib inline
print("Checking files in data location ...")
print(check_output(["ls", "../data"]).decode("utf8"))
path_parent = os.path.dirname(os.getcwd())
data_dir = os.path.join(path_parent,'data')
processed_dir = os.path.join(data_dir,'processed')
logs = os.path.join(data_dir,'logs')
print(data_dir)
print(processed_dir)

df_clean = pd.read_csv(os.path.join(processed_dir,"clean.csv"))
df_anomaly = pd.read_csv(os.path.join(processed_dir,"anomaly.csv"))
df_audsome = pd.read_csv(os.path.join(processed_dir,"audesome_clearn.csv"))
df_audsome_anomaly = pd.read_csv(os.path.join(processed_dir,"audsome_anomaly.csv"))
print("Clean data shape: {}".format(df_clean.shape))
print("Anomaly data shape: {}".format(df_anomaly.shape))
print("Clean AUDSOME data shape: {}".format(df_audsome.shape))
print("Anomaly AUDSOME data shape: {}".format(df_audsome_anomaly.shape))

Checking files in data location ...
logs
pr_data_anomalies(1h).csv
pr_data_anomalies(80minutes).csv
pr_data_audesome_anomalies(65m).csv
pr_data_audesome_clean(1h).csv
pr_data_clean(1h).csv
processed

/Users/Gabriel/Documents/workspaces/Event-Detection-Engine/experiments/ede_exp/data
/Users/Gabriel/Documents/workspaces/Event-Detection-Engine/experiments/ede_exp/data/processed
Clean data shape: (3600, 62)
Anomaly data shape: (4800, 62)
Clean AUDSOME data shape: (3600, 62)
Anomaly AUDSOME data shape: (3900, 62)


In [2]:
# Logs
anomaly_logs_dir = os.path.join(logs, 'distributed')
print(anomaly_logs_dir)

/Users/Gabriel/Documents/workspaces/Event-Detection-Engine/experiments/ede_exp/data/logs/anomaly_exp


In [3]:
import glob
from pygrok import Grok

def setup_grok(pattern):
    grok = Grok(pattern)
    return grok

def parse_logs(file,
               type=None):
    if type == 'copy':
        pattern = '%{NUMBER:unixtime}  anomalies.py:%{NUMBER:linenumber} 	%{LOGLEVEL:loglevel}     %{WORD:status} %{WORD:anomaly_name} with options %{GREEDYDATA:settings} and uuid %{GREEDYDATA:uuid}'
    elif type == 'cpu':
        pattern = '%{NUMBER:unixtime}  anomalies.py:%{NUMBER:linenumber}  	%{LOGLEVEL:loglevel}     %{WORD:status} %{WORD:anomaly_name} with options %{GREEDYDATA:settings} and uuid %{GREEDYDATA:uuid}'
        # pattern = '%{NUMBER:unixtime}  anomalies.py:%{NUMBER:linenumber}%{GREEDYDATA}%{LOGLEVEL:loglevel}     %{WORD:status} %{WORD:anomaly_name} with options %{GREEDYDATA:settiongs} and uuid %{GREEDYDATA:uuid}'
    else:
        pattern = '%{NUMBER:unixtime}  anomalies.py:%{NUMBER:linenumber} 	%{LOGLEVEL:loglevel}     %{WORD:status} %{WORD:anomaly_name} with options %{GREEDYDATA:settings} and uuid %{GREEDYDATA:uuid}'
    grok = setup_grok(pattern)
    with open(file, 'r') as log:
        lines = log.readlines()

    parsed = []
    for line in lines:
        match = grok.match(line)
        if match is None:
            pattern2 = '%{NUMBER:unixtime}  anomalies.py:%{NUMBER:linenumber} 	%{LOGLEVEL:loglevel}     %{WORD:status} %{WORD:anomaly_name} of uuid %{GREEDYDATA:uuid} for'
            grok2 = setup_grok(pattern2)
            match2 = grok2.match(line)
            if match2 is None:
                pass
            else:
                match = match2
        parsed.append(match)
        print(match)
    return parsed

In [4]:
# Log files
copy_logs = os.path.join(anomaly_logs_dir, 'copy-out.log')
cpu_logs = os.path.join(anomaly_logs_dir, 'cpu_overload-out.log')
ddot_logs = os.path.join(anomaly_logs_dir, 'ddot-out.log')
mem_logs = os.path.join(anomaly_logs_dir, 'mem_eater-out.log')
dummy_logs = os.path.join(anomaly_logs_dir, 'mem_eater-out.log')

# Parsed logs
copy_list = parse_logs(copy_logs,type='copy')
cpu_list = parse_logs(cpu_logs,type='cpu')
mem_list = parse_logs(mem_logs)
ddot_list = parse_logs(ddot_logs)
dummy_list  = parse_logs(dummy_logs)

# lines = file1.readlines()
# for line in lines:
#     print(line.strip())

{'unixtime': '1607516375.369525', 'linenumber': '193', 'loglevel': 'INFO', 'status': 'Started', 'anomaly_name': 'copy', 'settings': '[unit gb, multiplier 3, remove 1, time_out 20]', 'uuid': '2c6bb32e-3e2b-407a-821f-f8ea898055a1'}
{'unixtime': '1607516375.371002', 'linenumber': '174', 'loglevel': 'INFO', 'status': 'Started', 'anomaly_name': 'Generating_large_file', 'settings': '[size 3221225472, multiplier 3]', 'uuid': 'gb'}
{'unixtime': '1607516396.182727', 'linenumber': '179', 'loglevel': 'INFO', 'status': 'Finished', 'anomaly_name': 'Generating_large_file', 'settings': '[size 3221225472, multiplier 3]', 'uuid': 'gb'}
{'unixtime': '1607516456.766483', 'linenumber': '213', 'loglevel': 'INFO', 'status': 'Finished', 'anomaly_name': 'copy', 'settings': '[unit gb, multiplier 3, remove 1, time_out 20]', 'uuid': '2c6bb32e-3e2b-407a-821f-f8ea898055a1'}
{'unixtime': '1607516456.954106', 'linenumber': '193', 'loglevel': 'INFO', 'status': 'Started', 'anomaly_name': 'copy', 'settings': '[unit gb,

In [5]:
def preflight_labels(parsed_list):
    anomalies = {}
    for e in parsed_list:
        if e is None:
            pass
        else:
            # print(e)
            # print(e['uuid'])
            if e['uuid'] in anomalies.keys():
                pass
            else:
                anomalies[e['uuid']] = {}
            if e['status'] == 'Started':
                anomalies[e['uuid']]['start'] = e['unixtime']
            elif e['status'] == 'Finished':
                anomalies[e['uuid']]['stop'] = e['unixtime']
            elif e['status'] == 'Finised': # todo fix typo for memoryv2
                anomalies[e['uuid']]['stop'] = e['unixtime']
            elif e['status'] == 'Modifier' :
                if 'modifiers' in anomalies[e['uuid']].keys():
                    pass
                else:
                    anomalies[e['uuid']]['modifiers'] = []
                anomalies[e['uuid']]['modifiers'].append(e['unixtime'])
    # print(anomalies)
    return anomalies


pre_cpu = preflight_labels(cpu_list)
pre_mem = preflight_labels(mem_list)
pre_copy =preflight_labels(copy_list)
pre_dummy = preflight_labels(dummy_list)
pre_dummy = preflight_labels(ddot_list)

In [15]:
print(type(pre_mem))


<class 'dict'>


In [9]:
# Add labels to data
def label(df, pre, label='target'):
    print(pre)
    # for k,v in pre.items():
    #     df['target'] = df['time'].apply(lambda x: label if (float(v['start']) <= x <= float(v['stop'])) else 0)
    #     df.loc[((df['time'] >= float(v['start']) | (df['Value_2'] <= float(v['stop'])), 'High_Value_Ind'] = 'Y'
    df[label] = 0
    labels = []
    for k,v in pre.items():
        for k2, v2 in df.iterrows():
            if (v2['time'] >= float(v['start']) and v2['time'] <= float(v['start'])):
                labels.append("ok")
        # df.loc[df['time'] >= v['start'].astype(np.float), 'target'] = label
        # df.loc[df['time'] <= v['stop'].astype(np.float), 'target'] = label
    print(len(labels))
    return df
df_labeled = label(df_anomaly, pre_cpu, 'cpu')

{'db3e3197-f547-4781-9829-185157e5636a': {'start': '1607517460.242392', 'stop': '1607517520.362323'}, '9aefe458-0d60-4bf0-afc4-e4730fcffa3e': {'start': '1607517877.308268', 'stop': '1607517937.422405'}, '5d51250f-0b16-4512-911d-3cfc54b75719': {'start': '1607517957.846296', 'stop': '1607518017.954151'}, '990c90a3-0308-4a28-94f4-7379b07fa580': {'start': '1607518466.229090', 'stop': '1607518526.339345'}, '0e5b7ec0-04da-4c24-be92-0c00f996055d': {'start': '1607518860.838976', 'stop': '1607518920.954849'}, '7082b00f-80da-45c5-8971-b23d01b19df4': {'start': '1607519907.887355', 'stop': '1607519967.994844'}, '01995677-468c-407e-8229-bfd4699c1a8a': {'start': '1607520483.497519', 'stop': '1607520543.618617'}, 'd959eb7f-dd71-4703-86b0-e6a289c6df61': {'start': '1607520770.335384', 'stop': '1607520830.470035'}}
0


In [12]:
df_labeled['target']
df_labeled.apply(lambda x: x.nunique())
# print('done')


node_context_switches_total_10.211.55.101:9100    4800
node_cpu_seconds_total_10.211.55.101:9100         2968
node_entropy_available_bits_10.211.55.101:9100    1170
node_filefd_allocated_10.211.55.101:9100             5
node_filesystem_avail_bytes_10.211.55.101:9100     497
                                                  ... 
node_vmstat_pgmajfault_10.211.55.101:9100          280
node_vmstat_pgpgin_10.211.55.101:9100              282
node_vmstat_pgpgout_10.211.55.101:9100             857
time                                              4792
target                                               1
Length: 63, dtype: int64