## TODO


### Imports

In [None]:
import pandas
import numpy as np
from ipywidgets import IntProgress
from IPython.display import display
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.rcParams['figure.figsize'] = [8,5]
np.set_printoptions(suppress=True)

import os
import shutil
from matplotlib.ticker import StrMethodFormatter, NullFormatter
import dictdiffer

from tqdm.notebook import tnrange as nrange
from tqdm.notebook import tqdm
from datetime import datetime
import re
import traceback

In [None]:
def set_pandas_display_options() -> None:
    """Configure how pandas renders DataFrames in this notebook.

    Ref: https://stackoverflow.com/a/52432757/
    """
    settings = {
        "max_columns": 1000,
        "max_rows": 100,
        "max_colwidth": 199,
        "width": None,
        "float_format": '{:.2f}'.format,
        # "precision": 2,  # set as needed
    }
    for option, value in settings.items():
        setattr(pandas.options.display, option, value)


set_pandas_display_options()

In [None]:
# Number of the experiment to evaluate; it selects both the input and the output folder.
experiment = 3

# Base folders; the experiment number and a trailing separator are appended below.
path = "E:\\Studium\\10_Semester\\Masterarbeit\\Deployment\\Kafka\\results\\setup"
output = "E:\\Studium\\10_Semester\\Masterarbeit\\Deployment\\Kafka\\evaluation\\"
path = f"{path}{experiment}{os.path.sep}"
output = f"{output}{experiment}{os.path.sep}"

# Every entry of the experiment folder is treated as one run.
runs = os.listdir(path)

## Filter

In [None]:
def dropFirstXRows(input, x):
    """Return ``input`` without the rows whose index labels are 0 .. x-1."""
    labels = np.arange(0, x)
    return input.drop(labels)

def removeNaN(array, array2):
    """Drop the positions where ``array`` is NaN from both arrays.

    Returns the filtered ``array`` and the equally filtered ``array2``.
    """
    keep = ~np.isnan(array)
    return array[keep], array2[keep]

In [None]:
def filterLastData(generated, received):
    """Keep only generated rows whose Kafka offset does not exceed the last received offset."""
    receivedOffsets = received["Kafka.Offset"].to_numpy()
    withinReceived = generated["Kafka.Offset"] <= receivedOffsets[-1]
    return generated[withinReceived]

In [None]:
def extractDate(line):
    """Parse the leading ``YYYY-MM-DD HH:MM:SS`` of a log line.

    Only the first two space-separated tokens are used. The result is the
    (local-time) epoch timestamp in milliseconds, with second resolution.
    """
    stamp = " ".join(line.split(" ")[0:2])
    return int(datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").timestamp()) * 1000


def transitionsFromFile(path):
    """Extract the state transitions recorded in a run log.

    The line following "Starting transition to state <name>" provides the
    start timestamp, the line following "Doing state" provides the end
    timestamp.

    Returns a list of ``(name, start_ms, end_ms)`` tuples in file order.
    """
    transitions = []
    start = False
    end = False
    name = None
    last = 0
    # The context manager closes the file even when a timestamp fails to parse.
    with open(path, "r") as f:
        for line in f:
            if line.startswith("Starting transition to state"):
                # fifth token is the state name; its last character is cut off
                name = line.split(" ")[4][:-1]
                start = True
            elif line.startswith("Doing state"):
                end = True
            elif start:
                last = extractDate(line)
                start = False
            elif end:
                transitions.append((name, last, extractDate(line)))
                end = False
    return transitions

In [None]:
def getBatches(file):
    """Collect the batch sizes reported in a producer log.

    Every line ending in "Received <n> records for topic t<digit>" contributes
    ``n`` to the result.

    Returns the list of batch sizes (ints) in file order.
    """
    # The count is captured from the matched text itself, so an earlier,
    # unrelated "Received" in the same line cannot be picked up by mistake.
    pattern = re.compile(r"Received ([0-9]+) records for topic t[0-9]$")
    result = []
    with open(file, "r") as f:
        for line in f:
            match = pattern.search(line)
            if match:
                result.append(int(match.group(1)))
    return result

def removeDouble(df, column, df_name, name):
    """Drop rows that repeat a value in ``column`` and report how many were removed.

    ``df_name`` and ``name`` are only used in the printed message.
    """
    deduplicated = df.drop_duplicates(subset = column)
    removed = len(df) - len(deduplicated)
    if removed != 0:
        print("Deleted %i doubles for %s of %s" %(removed, df_name, name))
    return deduplicated

In [None]:
def _readProducedCsvWithoutErrors(dataPath, prod):
    """Read ``<dataPath>_produced.csv``, skipping lines that start with a Kafka exception class name.

    The cleaned content is written to a temporary ``<dataPath>_tmp.csv`` that is
    removed again once pandas has read it (also when reading fails).
    ``prod`` is only used in the printed messages.
    """
    tmpPath = dataPath + "_tmp.csv"
    errors = 0
    with open(dataPath + "_produced.csv") as source, open(tmpPath, "w") as cleaned:
        for line in source:
            if line.startswith("org.apache.kafka.common.errors.TimeoutException"):
                # timeouts are only counted, not printed
                errors += 1
            elif line.startswith("org.apache.kafka.common"):
                errors += 1
                print("Got %s for %s" %(line, prod))
            else:
                cleaned.write(line)
    if errors > 0:
        print("Got %i errors for %s" %(errors, prod))

    try:
        return pandas.read_csv(tmpPath)
    finally:
        os.remove(tmpPath)


def loadData(path):
    """Load and clean the recorded CSV data of every producer of every run.

    Reads the module-level ``runs`` and ``experiment``. For each producer the
    produced/received/filtered/warnings/modelchange CSVs are loaded,
    de-duplicated by Kafka offset, stripped of the first 60100 produced
    elements and joined into one frame. A producer whose data cannot be
    processed is reported and skipped.

    Returns a tuple
    ``(names, produced_generated, produced_recieved, filtered, warnings,
    modelchange, firstTimestamp, transitions, producerRun, kafkaBatches,
    fullJoins)``. All lists except ``kafkaBatches`` hold one entry per
    successfully loaded producer; ``kafkaBatches`` holds one entry per
    producer directory and ``transitions`` is a dict keyed by run.
    """
    # Local import: the error handler below prints to sys.stdout, and the
    # notebook imports sys only in a later cell.
    import sys

    names = []
    produced_generated = []
    produced_recieved = []
    filtered = []
    warnings = []
    modelchange = []
    firstTimestamp = []
    producerRun = []
    kafkaBatches = []
    fullJoins = []
    transitions = {}

    sep = os.path.sep
    for r in runs:
        runPath = path + r + sep
        pathHelper = runPath + "data" + sep

        producer = os.listdir(pathHelper)

        for p in producer:
            kafkaBatches.append(getBatches(runPath + "logs" + sep + p + sep + "log.log"))

        producer = [pathHelper + x + sep for x in producer]
        transitions[r] = transitionsFromFile(path + ".." + sep + "runs" + sep + "exp" + str(experiment) + "-" + r + ".log")

        for prod in tqdm(producer):

            try:
                time = os.listdir(prod)[0]
                # the only file with a 6-character name (e.g. "t1.csv") names the topic
                topic = [x for x in os.listdir(prod + time + sep) if len(x) == 6][0][:-4]
                name = "run_" + r + "_" + topic

                dataPath = prod + sep + time + sep + topic

                produced_generated_df = _readProducedCsvWithoutErrors(dataPath, prod)
                produced_generated_df = removeDouble(produced_generated_df, "Kafka.Offset", "produced_generated", name)

                produced_recieved_df = pandas.read_csv(dataPath + ".csv")
                produced_recieved_df = removeDouble(produced_recieved_df, "Kafka.Offset", "produced_recieved", name)

                produced_generated_df = filterLastData(produced_generated_df, produced_recieved_df)

                filtered_df = pandas.read_csv(dataPath + "_filtered.csv")
                filtered_df = removeDouble(filtered_df, "Kafka.Offset", "filtered", name)
                warnings_df = pandas.read_csv(dataPath + "_warnings.csv")
                warnings_df = removeDouble(warnings_df, "Kafka.Offset", "warnings", name)
                modelchange_df = pandas.read_csv(dataPath + "_modelchange.csv")

                #remove first 60100 elements ( around 5 minutes)

                modelchange_df = modelchange_df[(modelchange_df["producedElements"] > 60101)]

                producedFilter = produced_generated_df["ProducedElements"] <= 60100 # last point of anomaly

                # Match the warm-up records with their received counterparts to
                # learn which Kafka offsets have to be removed downstream.
                warmup = produced_generated_df.set_index("Kafka.Offset")\
                    .join(produced_recieved_df.set_index("Kafka.Offset")["Data.Measurement"])\
                    .set_index("Data.Measurement")\
                    .join(produced_recieved_df.set_index("Data.Measurement").add_prefix("pr_"))

                warmup = warmup[(warmup["Producer.Timestamp"] - warmup["pr_Data.Timestamp"]).abs() < 1]
                warmup = warmup[warmup["ProducedElements"] <= 60100]

                offsets = warmup["pr_Kafka.Offset"].to_numpy()
                produced_generated_df = produced_generated_df[np.logical_not(producedFilter)]

                producedReceivedFilter = np.isin(produced_recieved_df["Kafka.Offset"].to_numpy(), offsets)
                produced_recieved_df = produced_recieved_df[np.logical_not(producedReceivedFilter)]

                filteredFilter = np.isin(filtered_df["Data.Offset"].to_numpy(), offsets)
                # from here on `offsets` are the offsets of the filtered topic
                offsets = filtered_df["Kafka.Offset"][filteredFilter].to_numpy()
                filtered_df = filtered_df[np.logical_not(filteredFilter)]

                warningFilter = np.isin(warnings_df["Record.BeginOffset"], offsets)
                warnings_df = warnings_df[np.logical_not(warningFilter)]

                # first remaining row, fourth column (read positionally)
                firstTimestampValue = produced_generated_df.iloc[0, 3]

                produced_generated_with_measurement = produced_generated_df.set_index("Kafka.Offset")\
                    .join(produced_recieved_df.set_index("Kafka.Offset")["Data.Measurement"])\
                    .reset_index()
                fullJoin = produced_generated_with_measurement.set_index("Data.Measurement").add_prefix("pg_")\
                    .join(produced_recieved_df.set_index("Data.Measurement").add_prefix("pr_"))
                fullJoin.index.name = "Data.Measurement"

                fullJoin = fullJoin\
                    .reset_index()\
                    .set_index("pr_Kafka.Offset")\
                    .join(filtered_df.add_prefix("fi_").set_index("fi_Data.Offset"), how = "outer")
                fullJoin.index.name = "pr_Kafka.Offset / fi_Data.Offset"

                fullJoin = fullJoin\
                    .reset_index()\
                    .set_index("fi_Kafka.Offset")\
                    .join(warnings_df.add_prefix("wa_").set_index("wa_Record.BeginOffset"), how = "outer")
                fullJoin.index.name = "fi_Kafka.Offset / wa_Record.BeginOffset"
                fullJoin = fullJoin.reset_index()

                diff = (fullJoin["pr_Data.Timestamp"] - fullJoin["pg_Producer.Timestamp"]).to_numpy()
                # The parentheses matter: `|` binds tighter than `>`, so without
                # them the NaN rows were compared against 1 and never removed.
                fjFilter = (np.abs(diff) > 0) | np.isnan(diff)
                fullJoin = fullJoin[np.logical_not(fjFilter)]

                producerRun.append(r)
                produced_generated.append(produced_generated_df)
                produced_recieved.append(produced_recieved_df)
                filtered.append(filtered_df)
                warnings.append(warnings_df)
                modelchange.append(modelchange_df)
                firstTimestamp.append(firstTimestampValue)
                fullJoins.append(fullJoin)

                names.append(name)
            except Exception:
                # Best effort: report the broken producer and continue with the next one.
                traceback.print_exc(file=sys.stdout)
                print("error",prod)

    return names, produced_generated, produced_recieved, filtered, warnings, modelchange, firstTimestamp, transitions, producerRun, kafkaBatches, fullJoins

In [None]:
# Load the data of all runs of the selected experiment.
# The returned lists hold one entry per successfully loaded producer (same order as `names`),
# except `kafkaBatches` (one entry per producer directory) and `transitions` (a dict keyed by run).

names, produced_generated, produced_recieved, filtered, warnings, modelchange, firstTimestamp, transitions, producerRun, kafkaBatches, fullJoins = loadData(path)

In [None]:
def addToData(data, topic, current, seconds, absTime):
    """Record ``current`` as the newest state of ``topic`` if it differs from the last one.

    ``data`` maps a topic to parallel lists "data", "time" and "abstime".
    An empty ``current`` is ignored. ``data`` is modified in place and returned.
    """
    # nothing collected for this topic
    if not current:
        return data

    if topic not in data:
        data[topic] = {
            "data" : [current],
            "time" : [seconds],
            "abstime" : [absTime]
        }
        return data

    history = data[topic]
    if history["data"][-1] != current:
        history["data"].append(current)
        history["time"].append(seconds)
        history["abstime"].append(absTime)
    return data

def showDiff(data, transition):
    """Print, per key in sorted order, the first state and every later change.

    For each change the difference to the previous state, the time of the
    change and - when the change happened after the end of the latest
    transition that started before it - the distance to that transition
    are printed.
    """
    for key in sorted(data.keys()):
        entry = data[key]
        snapshots = entry["data"]
        print(key)
        print(snapshots[0])
        previous = snapshots[0]
        for pos in range(1, len(snapshots)):
            changed = snapshots[pos]
            when = entry["abstime"][pos]
            print("\x1b[31m" + str(list(dictdiffer.diff(previous,changed))) + "\x1b[0m")
            print("Minute %.1f %s" %(entry["time"][pos] / 60, datetime.fromtimestamp(when / 1000)))
            # latest transition that started no later than this change
            latest = None
            for tr in transition:
                if(tr[1] > when):
                    break
                latest = tr
            if latest is not None and latest[2] <= when:
                print(latest)
                print("%.2f s after %s / %.2f s after start" %((when - latest[2]) / 1000,latest[0], (when - latest[1]) / 1000))
            previous = changed
        print()
        
def replaceKafkaWithNode(c, pods, second):
    """Translate Kafka broker ids into the servers hosting them at ``second``.

    An id of -1 becomes "No". For every other id the last recorded state of
    pod "kafka-<id>" whose time is not after ``second`` is used (the first
    state if all are later). Returns the set of resulting names.
    """
    servers = set()
    for brokerId in c:
        if brokerId == -1:
            servers.add("No")
            continue
        history = pods["kafka-" + str(brokerId)]
        position = 0
        for candidate, recordedAt in enumerate(history["time"]):
            if recordedAt > second:
                break
            position = candidate
        servers.add(history["data"][position]["server"])
    return servers

def extractResponsibilities(lines, transition):
    """Parse a responsibilities log into topic assignments and pod placements.

    The log consists of trials ("Trial ..." lines) that each list the
    partitions of every topic followed by a pod table introduced by a "NAME"
    header line. Only changes are stored: per topic / pod a history of
    states with the relative ("time") and absolute ("abstime") time of the
    trial in which the state was first seen.

    After parsing, the broker ids in "replicas", "isrs" and "leader" are
    replaced by server names and both histories are printed via ``showDiff``.

    Returns ``(data, pods)``: the topic history and the pod history.
    """
    seconds = 0
    # Defined up front so that a topic/pod line appearing before the first
    # "Trial" line cannot hit an unbound name.
    absTime = 0
    topic = None
    kafka = True

    data = {}
    pods = {}

    current = {}

    podPrefixes = ("analyst", "filter", "kafka", "producer", "zoo")
    ignoredPrefixes = ("debug","Error from server: rpc error: code = Unavailable desc = transport is closing","The connection to the server ", "Error from server: etcdserver: request timed out")

    for line in lines:
        split = [token for token in line.split(" ") if len(token) > 0]
        if line.startswith("Trial "):
            kafka = True
            seconds = int(split[3])
            absTime = int(split[5]) * 1000
            current = {}
        elif line.startswith("  topic \""):
            # a new topic starts: store what was collected for the previous one
            data = addToData(data, topic, current, seconds, absTime)
            topic = split[1][1:-1]
            current = {}
        elif line.startswith("    partition "):
            partition = int(split[1][:-1])
            current[partition] = {
                "leader" : int(split[3][:-1]),
                "replicas" : {int(x) for x in split[5][:-1].split(",")},
                "isrs" : {int(x) for x in split[7][:-1].split(",")}
            }
        elif line.startswith("pod \"debug"):
            # end of the topic listing: store the last topic
            data = addToData(data, topic, current, seconds, absTime)
        elif line.startswith("NAME"):
            # header of the pod table
            kafka = False
        elif not kafka:
            if line.startswith(podPrefixes):
                current = {
                    "state" : split[1],
                    "server": split[2][:-1]
                }
                # same change-only bookkeeping as for the topics
                pods = addToData(pods, split[0], current, seconds, absTime)
            elif not line.startswith(ignoredPrefixes):
                print("Error", line)

    for history in data.values():
        for snapshot, time in zip(history["data"], history["time"]):
            for assignment in snapshot.values():
                assignment["replicas"] = replaceKafkaWithNode(assignment["replicas"], pods, time)
                assignment["isrs"] = replaceKafkaWithNode(assignment["isrs"], pods, time)
                # NOTE(review): the leader is resolved at second 0 while replicas/isrs
                # use the snapshot time - confirm whether this is intended.
                assignment["leader"] = list(replaceKafkaWithNode([assignment["leader"]], pods, 0))[0]

    showDiff(data, transition)

    for i in range(5):
        print("====================================")

    showDiff(pods, transition)

    for i in range(5):
        print("====================================")

    return data, pods

# Parsed responsibilities per run: topic -> assignment history and pod -> placement history.
topicAssignments = {}
podsAssignments = {}

for run in runs:
    print("\x1b[34mRUN:" + str(run) + "\x1b[0m")
    p = path + run + os.path.sep + "logs" + os.path.sep + "responsibilities.log"
    with open(p, 'r') as file:
        lines = file.readlines()
    data, pods = extractResponsibilities(lines, transitions[run])
    topicAssignments[run], podsAssignments[run] = data, pods

### Helper

In [None]:
# Result lists of the classification loop below. Each one collects positions in `names`
# (via names.index) of the run/topic combinations that match the respective condition.

# raw data topic: leader changed / number of in-sync replicas changed
rawDataLeaderChange = []
rawDataReplicaDown = []

# filtered topic: same checks, only recorded when the raw data topic was unaffected
filterLeaderChange = []
filterReplicaDown = []

# filter / analyst pod seen on a different server while the topics checked before it were unaffected
filterServiceHostChange = []
analystServiceHostChange = []

def leaderChange(data):
    """Return True if the leader of partition 0 differs between recorded states."""
    if len(data['time']) <= 1:
        return False
    initialLeader = data['data'][0][0]['leader']
    return any(state[0]['leader'] != initialLeader for state in data['data'])

def replicaDown(data):
    """Return True if the number of in-sync replicas of partition 0 changed without a leader change."""
    if leaderChange(data) or len(data['time']) <= 1:
        return False
    initialCount = len(data['data'][0][0]['isrs'])
    return any(len(state[0]['isrs']) != initialCount for state in data['data'])

def podChangeHost(data):
    """Return True if the pod was recorded on more than one server."""
    if len(data['time']) <= 1:
        return False
    initialServer = data['data'][0]['server']
    return any(state['server'] != initialServer for state in data['data'])

# Classify every run/topic combination by the first component that showed a change.
for x in topicAssignments:
    topicAssignment = topicAssignments[x]
    podAssignment = podsAssignments[x]
    #dont look at t7 for exp 1
    for i in range(1, 7 if experiment == 1 else 8):
        topicName = "t" + str(i)
        runName = names.index("run_" + x + "_" + topicName)

        # raw data topic
        rawDataLeaderChangeForTopic = leaderChange(topicAssignment[topicName])
        rawDataReplicaDownForTopic = replicaDown(topicAssignment[topicName])
        rawdataFine = not (rawDataLeaderChangeForTopic or rawDataReplicaDownForTopic)

        # filtered topic and the filter service producing it
        filterDataLeaderChangeForTopic = leaderChange(topicAssignment[topicName+"_filtered"])
        filterDataReplicaDownForTopic = replicaDown(topicAssignment[topicName+"_filtered"])
        filterServiceDown = podChangeHost(podAssignment["filter-" + str(i - 1)])

        # warnings topic and the analyst service producing it
        warningDataLeaderChangeForTopic = leaderChange(topicAssignment[topicName+"_warnings"])
        warningDataReplicaDownForTopic = replicaDown(topicAssignment[topicName+"_warnings"])
        analystServiceDown = podChangeHost(podAssignment["analyst-" + str(i - 1)])

        if rawDataLeaderChangeForTopic:
            rawDataLeaderChange.append(runName)
        if rawDataReplicaDownForTopic:
            rawDataReplicaDown.append(runName)

        # later stages are only attributed when the raw data topic was unaffected
        if not rawdataFine:
            continue

        if filterServiceDown:
            if not (filterDataLeaderChangeForTopic or filterDataReplicaDownForTopic):
                filterServiceHostChange.append(runName)
        elif filterDataLeaderChangeForTopic:
            filterLeaderChange.append(runName)
        elif filterDataReplicaDownForTopic:
            filterReplicaDown.append(runName)
        elif not warningDataLeaderChangeForTopic and not warningDataReplicaDownForTopic and analystServiceDown:
            #until warning service everything fine
            analystServiceHostChange.append(runName)
                    
#Remove t7 in experiment 1
# Positions in `names` of everything that should be evaluated.
allIndexes = [names.index(x) for x in names if not (experiment == 1 and x[-1] == '7')]
        
        
# Names of all run/topic combinations whose filter / analyst pod was seen on more than one server.
# NOTE(review): unlike the classification above, this always covers t1..t7, also for experiment 1.
filterDown = []
analystDown = []
for x in topicAssignments:
    podAssignment = podsAssignments[x]
    for i in range(1,8):
        runName = "run_%s_t%d" %(x, i)
        for prefix, collected in (("filter-", filterDown), ("analyst-", analystDown)):
            if podChangeHost(podAssignment[prefix + str(i - 1)]):
                collected.append(runName)
print("Filter service was put on another Host", filterDown)
print("Analyst service was put on another Host", analystDown)

# Index selections (positions in `names`) to evaluate, with the label used for each of them.
runFor = [range(len(names))]
additionalText = ["All"]

if(experiment == 1):
    runFor.append(allIndexes)
    additionalText.append("NotT7")

if(experiment != 0):
    # selections that already hold positions in `names`; empty ones are skipped
    for selection, label in (
        (rawDataLeaderChange, "RawDataLeaderChange"),
        (rawDataReplicaDown, "RawDataReplicaDown"),
        (filterLeaderChange, "FilterLeaderChangen"),
        (filterReplicaDown, "FilterReplicaDown"),
        (filterServiceHostChange, "FilterServiceHostChange"),
        (analystServiceHostChange, "AnalystServiceHostChange"),
    ):
        if(len(selection) > 0):
            runFor.append(selection)
            additionalText.append(label)

    # selections that hold names and have to be translated into positions first
    for affected, label in (
        (filterDown, "FilterServiceHostChangeWithInfluence"),
        (analystDown, "AnalystServiceHostChangeWithInfluence"),
    ):
        if(len(affected) > 0):
            runFor.append([names.index(x) for x in affected])
            additionalText.append(label)

In [None]:
from datetime import datetime

# Print every transition of every run and, from the second transition on,
# the seconds between the previous start and the current end.
for k, runTransitions in transitions.items():
    lastStart = 0
    for state, started, finished in runTransitions:
        if(lastStart != 0):
            print((finished - lastStart) / 1000)
        lastStart = started
        print("run %s state %s started %s until %s" %(k, state, datetime.fromtimestamp(started / 1000), datetime.fromtimestamp(finished / 1000)))
    print()

In [None]:
# line width shared by the line plots
linewidth = .8

def transformTimestamp(timestamps, firstTimestamp):
    """Return the time elapsed since ``firstTimestamp``, divided by 60000.

    Presumably converts millisecond timestamps into minutes - the callers
    label the result "in min".
    """
    elapsed = timestamps - firstTimestamp
    return elapsed / 60000

def plotDistributionOverTime(data, timestamps, path, firstTimestamp, transitions = None, xlim=None,ylim=None):
    """Plot ``data`` against the runtime of the experiment.

    Transitions that start between ``firstTimestamp`` and the last timestamp
    are highlighted. The figure is written to ``<path>_over_time.pdf`` and
    ``<path>_over_time.jpg``, or shown when ``path`` is None.
    """
    labeled = False
    for t in (() if transitions is None else transitions):
        if firstTimestamp < t[1] < timestamps[-1]:
            # only the first span gets a label so the legend has a single entry
            spanLabel = None if labeled else "MockFog's transition phase"
            plt.axvspan(transformTimestamp(t[1],firstTimestamp), transformTimestamp(t[2],firstTimestamp), facecolor='r', alpha=0.2, label = spanLabel)
            labeled = True
    if labeled:
        plt.legend()

    plt.plot(transformTimestamp(timestamps,firstTimestamp), data,linewidth=linewidth)
    plt.xlabel("runtime of the experiment in min")
    plt.ylabel("duration in ms")
    plt.xlim(xlim)
    plt.ylim(ylim)

    if path is None:
        plt.show()
    else:
        plt.savefig(path + "_over_time.pdf")
        plt.savefig(path + "_over_time.jpg", dpi = 300)
    plt.close()
    
    
def plotDistributionCumulative(data, path = None):
    """Plot the cumulative distribution of ``data`` on a logarithmic x axis.

    The figure is written to ``<path>_dist.pdf`` and ``<path>_dist.jpg``, or
    shown when ``path`` is None. It is closed in both cases, matching
    ``plotDistributionOverTime``; previously it stayed open without a path.
    """
    fig, ax = plt.subplots()
    ordered = np.sort(data)
    plt.xscale("log")

    plt.plot(ordered, np.linspace(0, 1, len(ordered), endpoint=True), linewidth=linewidth)
    plt.xlabel("duration in ms")
    plt.ylabel("cumulative frequency")

    # plain numbers instead of powers of ten on the major ticks, no minor labels
    ax.xaxis.set_major_formatter(StrMethodFormatter('{x:.0f}'))
    ax.xaxis.set_minor_formatter(NullFormatter())

    if path is not None:
        plt.savefig(path + "_dist.pdf")
        plt.savefig(path + "_dist.jpg", dpi = 300)
    else:
        plt.show()
    plt.close(fig)

In [None]:
def extractAvgMedStdMinMaxFromArray(diff, timestamps, path, name, firstTimestamp, transitions, plot = True):
    """Plot ``diff`` and append its summary statistics to ``<path>values.csv``.

    The appended row is ``name; avg; median; std; min; max`` followed by the
    90th, 95th, 99th and 99.9th percentile. Plots and the CSV row are only
    produced when ``plot`` is True; the statistics are computed either way.

    Returns ``diff`` unchanged.
    """
    if plot:
        plotDistributionOverTime(diff, timestamps, path + name, firstTimestamp, transitions)
        plotDistributionCumulative(diff, path + name)

    # One value per column. The 99th and the 99.9th percentile are kept apart;
    # they used to share one variable, so the 99.9th was written twice.
    stats = (
        np.average(diff), np.median(diff), np.std(diff), np.min(diff), np.max(diff),
        np.percentile(diff, 90), np.percentile(diff, 95), np.percentile(diff, 99), np.percentile(diff, 99.9),
    )
    if plot:
        with open(path + 'values.csv', 'a') as file_object:
            file_object.write(";".join(str(x) for x in (name, *stats)) + '\n')
    return diff
    
def extractAvgMedStdMinMaxFromListOfArray(inputList,path, unit = "ms", plot = True, return_datas = False):
    """Print the summary statistics of the concatenation of ``inputList``.

    When ``plot`` is True the cumulative distribution is plotted to ``path``.
    Returns the concatenated array, or the statistics tuple
    (avg, median, std, min, max, 90th, 95th, 99th, 99.9th percentile)
    when ``return_datas`` is True.
    """
    diff = np.concatenate(inputList, axis = 0)

    if plot:
        plotDistributionCumulative(diff, path = path)

    datas = (np.average(diff), np.median(diff), np.std(diff), np.min(diff), np.max(diff), np.percentile(diff, 90), np.percentile(diff, 95), np.percentile(diff, 99), np.percentile(diff, 99.9))

    # one format fragment per statistic; each one is followed by the unit
    fragments = ["avg = %.2f ", "median %.2f ", "std %.2f ", "min %d ", "max %d ", "90%% %.2f ", "95%% %.2f ", "99%% %.2f ", "99.9%% %.2f "]
    text = "; ".join(fragment + unit for fragment in fragments)
    print(text %(datas))

    return datas if return_datas else diff

In [None]:
def prepareDictory(path):
    """Recreate the directory ``path`` empty and start a fresh ``values.csv`` in it.

    ``path`` must end with a path separator. Any existing content of the
    directory is deleted. The CSV only contains the header row afterwards.
    """
    shutil.rmtree(path,ignore_errors=True)
    os.makedirs(path, exist_ok=True)
    # The last column holds the 99.9th percentile; it used to repeat the name "per99".
    header = ["name", "avg", "med", "std", "minimum", "maximum", "per90", "per95", "per99", "per99.9"]
    with open(path + 'values.csv', 'w') as file_object:
        file_object.write(";".join(header) + '\n')

### Histogram of generated data over time

Consumer and producer timestamps of the produced data.

Data is not produced exactly every 5 ms; there are peaks and lows, and only the average interval is 5 ms.

In [None]:
def plotHist(data, label, firstTimestamp, transitions, width = 250, slabel = False):
    """Add a histogram of the timestamps in ``data`` to the current figure.

    Bins are ``width`` wide (in the unit of ``data``). Transitions starting
    between ``firstTimestamp`` and the last timestamp are highlighted, and a
    horizontal line at ``width / 5`` is drawn; ``slabel`` controls whether
    that line appears in the legend.
    """
    timespan = data[-1] - data[0]

    labeled = False
    for t in (() if transitions is None else transitions):
        if firstTimestamp < t[1] < data[-1]:
            # only the first span gets a label so the legend has a single entry
            spanLabel = None if labeled else "MockFog's transition phase"
            plt.axvspan(transformTimestamp(t[1],firstTimestamp) * 60, transformTimestamp(t[2],firstTimestamp) * 60, facecolor='r', alpha=0.2, label = spanLabel)
            labeled = True

    lineLabel = "expected amount of points" if slabel else None
    plt.hist((data - firstTimestamp) / 1000, bins = int(timespan / width),zorder=2, label = label)
    plt.hlines(width / 5, 0, (data[-1] - firstTimestamp) / 1000, color = "green", zorder=3, label = lineLabel)
    plt.xlabel("runtime of the experiment in s")
    plt.ylabel("data points in an 250 ms intervall")
    plt.legend()

# One figure per producer: produced and received data points over time.
histDir = output + "produced_hist" + os.path.sep
for x in nrange(len(names)):
    try:
        producedTime = produced_generated[x].to_numpy()[:,3]
        receivedTime = produced_recieved[x].to_numpy()[:,0]
        plotHist(producedTime, "produced data points", firstTimestamp[x], transitions[producerRun[x]], slabel = True)
        plotHist(receivedTime, "recieved data points", firstTimestamp[x], None)
        os.makedirs(histDir, exist_ok=True)
        p = histDir + names[x]
        plt.savefig(p + ".pdf")
        plt.savefig(p + ".jpg", dpi = 300)
        plt.close()
    except Exception as e:
        # drop the half-finished figure and continue with the next producer
        plt.close()
        print(e)
        print("Error", names[x], x)



In [None]:
def kafkaAggCount(dataList, index, label, path):
    """Print how many records Kafka delivered per batch.

    ``dataList`` is a list of lists of batch sizes; sizes of 0 are ignored.
    Printed are the most frequent batch size, the share of batches with one
    and with at most five records, and the summary statistics (which are
    also plotted to ``path``). ``index`` is not used.
    """
    print("How often aggregates Kafka data for", label)
    counts = np.array([item for sublist in dataList for item in sublist])
    counts = counts[counts > 0]
    if counts.size == 0:
        # nothing to aggregate; the statistics below are undefined for empty input
        print("No batches with at least one record found")
        return

    sizes, occurrences = np.unique(counts, return_counts=True)
    countsSum = np.sum(occurrences)
    mostFrequent = np.argmax(occurrences)
    print("Modus: %d with %.2f%%" %(sizes[mostFrequent], (occurrences[mostFrequent] / countsSum) * 100))
    # summing the (possibly empty) selection yields 0 when no batch held exactly one record
    print("One element with %.2f%%" %((np.sum(occurrences[sizes == 1]) / countsSum) * 100))
    print("<= 5 elements %.2f%%" %((np.sum(occurrences[sizes <= 5]) / countsSum) * 100))
    extractAvgMedStdMinMaxFromListOfArray([counts], path, "r/s")

#only look at unbroken producers
# NOTE(review): allIndexes holds positions in `names`, whereas loadData appends to kafkaBatches
# once per producer directory (also when loading that producer fails later) - the two lists
# may not line up if any producer was skipped; TODO confirm.
data = [kafkaBatches[x] for x in allIndexes]
kafkaAggCount(data, 0, "produced_recieved", output + "produced_records_sub")

### Information on the time difference between the arrival time of records for all 3 topics

Peaks are caused by CPU time; the data is not produced at regular intervals.

In [None]:
def avgMedStdArrivalTime(df, column, path, name, firstTimestamp, transitions, filterArray = None):
    """Evaluate the differences between consecutive values of one column of ``df``.

    ``column`` is the positional index of the column. ``filterArray``
    optionally selects which differences are evaluated. Returns the
    evaluated differences.
    """
    values = df.to_numpy()[:,column].astype(np.int64)
    timestamps = values[1:]
    diff = timestamps - values[:-1]
    if filterArray is not None:
        diff, timestamps = diff[filterArray], timestamps[filterArray]
    return extractAvgMedStdMinMaxFromArray(diff, timestamps, path, name, firstTimestamp, transitions)
    

# Arrival time differences of the records of all four data sets, per producer and in total.
currentPath = output + "arrivalTimesOfData" + os.path.sep
shutil.rmtree(currentPath,ignore_errors=True)
pathesWithoutSep = [currentPath + "produced_generated", currentPath + "produced_received", currentPath + "filtered", currentPath + "warnings"]
pathes = [x + os.path.sep for x in pathesWithoutSep]
for x in pathes:
    prepareDictory(x)

pro_gen = []
pro_rec = []
fil = []
war = []

for x in nrange(len(names)):
    try:
        runTransitions = transitions[producerRun[x]]
        pro_gen.append(avgMedStdArrivalTime(produced_generated[x], 3, pathes[0], names[x], firstTimestamp[x], runTransitions))
        pro_rec.append(avgMedStdArrivalTime(produced_recieved[x], 0, pathes[1], names[x], firstTimestamp[x], runTransitions))
        fil.append(avgMedStdArrivalTime(filtered[x], 0, pathes[2], names[x], firstTimestamp[x], runTransitions))
        # warnings whose begin offset coincides with a model change are excluded
        offsetsAtModelChanges = modelchange[x].set_index("producedElements").join(fullJoins[x].set_index("pg_ProducedElements"))["fi_Kafka.Offset / wa_Record.BeginOffset"].to_numpy()
        atModelChange = np.isin(warnings[x]["Record.BeginOffset"].to_numpy(), offsetsAtModelChanges)[1:]
        warningFilter = np.logical_not(atModelChange)
        print(np.sum(atModelChange))
        war.append(avgMedStdArrivalTime(warnings[x], 0, pathes[3], names[x], firstTimestamp[x], runTransitions, warningFilter))
    except Exception as e:
        plt.close()
        # exceptions have no print_stack(); print the traceback via the traceback module
        traceback.print_exc()
        print(e)
        print("Error", names[x], x)

print("Arrival diff of generated data")
diff = extractAvgMedStdMinMaxFromListOfArray(pro_gen, pathesWithoutSep[0])
print("Percentage of 5ms %.2f" %((np.sum(diff == 5)/len(diff) * 100)))
print("Percentage of 0ms %.2f" %((np.sum(diff == 0)/len(diff) * 100)))
print("Percentage of <=10ms %.2f" %((np.sum(diff <= 10)/len(diff) * 100)))
# The label now states the threshold that is actually evaluated (it used to say 50ms).
# NOTE(review): confirm that 500 and not 50 is the intended threshold.
print("Percentage of >500ms %.2f" %((np.sum(diff > 500)/len(diff) * 100)))
print()
print("Arrival diff of generated data from Kafka")
extractAvgMedStdMinMaxFromListOfArray(pro_rec, pathesWithoutSep[1])
print("Arrival diff of filtered data from Kafka")
extractAvgMedStdMinMaxFromListOfArray(fil, pathesWithoutSep[2])
print("Arrival diff of warning data from Kafka")
_ = extractAvgMedStdMinMaxFromListOfArray(war, pathesWithoutSep[3])

In [None]:
#extract values for filter service down
# For every producer whose filter pod changed its server: print the timestamp before the
# largest gap in the filtered data and its distance to the first recorded server change.
print("time when the last value was send")
for x in filterDown:
    consumerTime = filtered[names.index(x)].to_numpy()[:,0].astype(np.int64)
    diff = consumerTime[1:] - consumerTime[:-1]
    index = np.argmax(diff)  # first position of the largest gap
    print(x, consumerTime[index])
    # names look like "run_<run>_<topic>"; splitting from the right keeps run names with "_" intact
    run, topic = x[len("run_"):].rsplit("_", 1)
    data = podsAssignments[run]["filter-" + str(int(topic[-1]) - 1)]
    firstServer = data["data"][0]["server"]
    # walk over all recorded states (previously only as many as the dict has keys)
    for state, changedAt in zip(data["data"], data["abstime"]):
        if(state["server"] != firstServer):
            print(x, consumerTime[index] - changedAt)
            break


## Check Produced

### How long does it take until a produced record is acknowledged by kafka

In [None]:
def kafkaAck(df, path, name, firstTimestamp, transitions):
    """Evaluate the difference between the third and the fourth column of ``df``.

    The fourth column is used as send time and the third as acknowledgement
    time. Returns the evaluated differences.
    """
    values = df.to_numpy()
    send = values[:,3].astype(np.int64)
    ack = values[:,2].astype(np.int64)
    return extractAvgMedStdMinMaxFromArray(ack - send, send, path, name, firstTimestamp, transitions)


currentPath = output + "producedAckByKafka" + os.path.sep
prepareDictory(currentPath + "data" + os.path.sep)

def extractKafkaAck(values, plot = True):
    """Run ``kafkaAck`` for the producers at the given positions in ``names``.

    Producers for which the evaluation raises are reported and left out of
    the result. ``plot`` is not used. Returns the list of evaluated
    differences.
    """
    dataPath = currentPath + "data" + os.path.sep
    kaf = []

    for x in tqdm(values):
        try:
            ack = kafkaAck(produced_generated[x], dataPath, names[x], firstTimestamp[x], transitions[producerRun[x]])
        except Exception as e:
            plt.close()
            print(e)
            print("Error", names[x], x)
        else:
            kaf.append(ack)
    return kaf

# Acknowledgement delays of all producers, evaluated once ...
# NOTE(review): extractKafkaAck leaves out producers whose evaluation raised, so diffAckAll can be
# shorter than `names` and the positions used below may be shifted - TODO confirm.
diffAckAll = extractKafkaAck(range(len(names)))

# ... and summarised for every selection in runFor.
for x in range(len(runFor)):
    diffAck = [diffAckAll[i] for i in runFor[x]]
    print("Produced acknowledged by Kafka for", additionalText[x])
    _ = extractAvgMedStdMinMaxFromListOfArray(diffAck, currentPath + additionalText[x])

### check validity of produced data

In [None]:
import sys

def genError(text, last, now, result, columns, filterValues, transition):
    """Fill the four `%s` slots of `text` for the fields whose check failed.

    `result` is the boolean outcome per checked field; only the entries that
    are False are reported. The slots receive, in order: the current values,
    the previous values, the affected column names and current minus previous.
    For every entry of `transition` whose [t[1], t[2]] range contains all
    reported current (or previous) values, a line naming t[0] is printed.
    """
    failed = np.where(np.logical_not(result))
    nowValues = now[filterValues][failed]
    lastValues = last[filterValues][failed]

    for state in transition:
        lower, upper = state[1], state[2]
        if np.all((nowValues >= lower) & (nowValues <= upper)):
            print("Now was doing state %s" %state[0])
        if np.all((lastValues >= lower) & (lastValues <= upper)):
            print("Last was doing state %s" %state[0])

    # Floats are rendered as plain integers in the message.
    asInteger = {'float_kind':lambda value: "%i" % value}
    nowText = np.array2string(nowValues, precision = 0, formatter=asInteger)
    lastText = np.array2string(lastValues, precision = 0, formatter=asInteger)
    fieldNames = str(columns[filterValues][failed])

    return text %(nowText, lastText, fieldNames, nowValues - lastValues)

def validate(df, higherFields, higherOrEqualFields, offset, transition, printOut = True):
    """Check consecutive rows of `df` against three ordering rules.

    For every row after the first, compared with the row before it:
      * columns in `higherFields` must be strictly greater,
      * columns in `higherOrEqualFields` must be greater or equal,
      * columns in `offset` must be exactly one larger.
    All three arguments are lists of column positions. With `printOut` each
    violation is printed via `genError`; a summary line is printed whenever at
    least one violation occurred.

    Returns a tuple (valid, higher violations, higher-or-equal violations,
    offset violations).
    """
    columns = np.array(df.columns)
    rows = df.to_numpy()#[:,0:4]

    higherA = 0
    higherEqualA = 0
    offsetStepA = 0

    previous = rows[0]
    for current in rows[1:]:
        strictlyHigher = current[higherFields] > previous[higherFields]
        if not np.all(strictlyHigher):
            if printOut:
                print(genError("Now %s was lower or equal than last %s for field %s, diff was %s", previous, current, strictlyHigher, columns, higherFields, transition))
            higherA += 1

        notLower = current[higherOrEqualFields] >= previous[higherOrEqualFields]
        if not np.all(notLower):
            if printOut:
                print(genError("Now %s was lower than last %s for field %s, diff was %s", previous, current, notLower, columns, higherOrEqualFields, transition))
            higherEqualA += 1

        steppedByOne = previous[offset] + 1 == current[offset]
        if not np.all(steppedByOne):
            if printOut:
                print(genError("Now %s is not increases by 1 last %s for field %s, diff was %s", previous, current, steppedByOne, columns, offset, transition))
            offsetStepA += 1

        previous = current

    # Any counted violation makes the frame invalid.
    error = (higherA + higherEqualA + offsetStepA) > 0
    if error:
        print("higher", higherA, "higher or equal", higherEqualA, "offset step not 1", offsetStepA)

    return (not error, higherA, higherEqualA, offsetStepA)


higher = 0
higherEqual = 0
offsetStep = 0

# One entry per topic: message on failure, frames per run, strictly increasing
# columns, non-decreasing columns, columns that must step by exactly one.
checks = (
    ("Not valid produced", produced_recieved, [], [0,1,3], [2]),
    ("Not valid filtered", filtered, [], [0,1,4], [2,3]),
    ("Not valid warnings", warnings, [4,5], [0,1,6,7], [2]),
)

for x in nrange(len(names)):
    t = transitions[producerRun[x]]
    for message, frames, higherCols, higherEqualCols, offsetCols in checks:
        r = validate(frames[x], higherCols, higherEqualCols, offsetCols, t)
        higher += r[1]
        higherEqual += r[2]
        offsetStep += r[3]
        if not r[0]:
            print(message, names[x])
            print()

print("higher", higher, "higher or equal", higherEqual, "offset step not 1", offsetStep)

### Time between the last step and the next

Negative values are possible: a node can receive the warning before the filtered record, depending on its position in the cluster and other aspects.

In [None]:
# Output locations for the stage-to-stage latency evaluation.
pathHelper = f"{output}receivedByFollowingTopic{os.path.sep}"

currentPathFW = f"{pathHelper}FilteredRecWarning"
currentPathPW = f"{pathHelper}ProducedRecWarning"
currentPathPGW = f"{pathHelper}ProducedGenWarning"
currentPathPGF = f"{pathHelper}ProducedGenFiltered"
currentPathPRF = f"{pathHelper}ProducedRecFiltered"
currentPathPPR = f"{pathHelper}ProducedGenProdRec"

# Unlike the paths above, this one already ends with a separator.
currentPathAll = f"{pathHelper}allTogether{os.path.sep}"

def extractReceivedByFollowingTopic(elements):
    """Compute the time differences between pipeline stages for each run in `elements`.

    For every run index `x`, `fullJoins[x]` is sorted, de-duplicated on
    "pg_ProducedElements" and filtered in several ways, and the difference of
    two timestamp columns is computed for each stage pair. Each difference
    array (minus its last `removeLastX` entries) is passed to
    `extractAvgMedStdMinMaxFromArray`, and that call's return value is
    collected. A combined plot of three of the series is saved per run as
    .jpg and .pdf under `currentPathAll`.

    Returns the lists (fw, pw, pgw, prf, pgf, ppr):
      fw  - wa_Consumer.Time - fi_Consumer.Time
      pw  - wa_Consumer.Time - pr_Consumer.Time
      pgw - wa_Consumer.Time - pg_Producer.Timestamp
      prf - fi_Consumer.Time - pr_Consumer.Time
      pgf - fi_Consumer.Time - pg_Producer.Timestamp
      ppr - pr_Consumer.Time - pg_Producer.Timestamp

    NOTE(review): an exception inside the loop is printed and the run is
    skipped, but the lists filled before the failure keep their entry, so the
    six lists can end up with different lengths.
    """
    fw = []
    pw = []
    pgw = []
    prf = []
    pgf = []
    ppr = []

    # Number of trailing samples cut from every difference/timestamp array below.
    removeLastX = 4

    for x in tqdm(elements):
        try:

            #Prod Gen -> Warning
            allJoin = fullJoins[x]\
            .sort_values(["wa_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])
            
            allJoin = allJoin[pandas.notna(allJoin["Data.Measurement"])]

            # NOTE(review): allJoinPW is never read afterwards.
            allJoinPW = allJoin

            # Rows with a measurement above 2.0 for which a warning time exists.
            pgwjoin = allJoin[(allJoin["Data.Measurement"]>2.0) & pandas.notna(allJoin["wa_Consumer.Time"])]
            
            # NOTE(review): 179900 is a hard-coded expected row count - confirm it matches the experiment setup.
            if(len(allJoin) != 179900):
                print(names[x], len(allJoin), "Not 179900 elements")

            diffPGW = (pgwjoin["wa_Consumer.Time"] - pgwjoin["pg_Producer.Timestamp"]).to_numpy()[:-removeLastX]
            timestampsPGW = pgwjoin["pg_Producer.Timestamp"].to_numpy()[:-removeLastX]
            
                        
            # Extra zoomed plot only for runs whose name ends in '7' in experiment 1.
            if(names[x][-1] == '7' and experiment == 1):
                plotDistributionOverTime(data=diffPGW,timestamps=timestampsPGW,path=currentPathPGW + os.path.sep + "data" + os.path.sep + names[x] + "zoom",firstTimestamp=firstTimestamp[x],xlim=(10,12),ylim=(0,20000),transitions=transitions[producerRun[x]])
            
            
            diffPGW = extractAvgMedStdMinMaxFromArray(diffPGW, timestampsPGW, currentPathPGW + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            pgw.append(diffPGW)

            #Prod Rec -> Warning
            diffPW = (pgwjoin["wa_Consumer.Time"] - pgwjoin["pr_Consumer.Time"]).to_numpy()[:-removeLastX]
            timestamps = pgwjoin["pr_Consumer.Time"].to_numpy()[:-removeLastX]
            diffPW = extractAvgMedStdMinMaxFromArray(diffPW, timestamps, currentPathPW + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            pw.append(diffPW)


            #Filter -> Warning
            allJoin = fullJoins[x]\
            .sort_values(["wa_Consumer.Time", "fi_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])
            allJoin = allJoin[(pandas.notna(allJoin["Data.Measurement"])) & (pandas.notna(allJoin["wa_Consumer.Time"]))]
            pgwjoin = allJoin[allJoin["Data.Measurement"]>2.0]

            diffFW = (pgwjoin["wa_Consumer.Time"] - pgwjoin["fi_Consumer.Time"]).to_numpy()[:-removeLastX]
            timestamps = pgwjoin["fi_Consumer.Time"].to_numpy()[:-removeLastX]

            diffFW = extractAvgMedStdMinMaxFromArray(diffFW, timestamps, currentPathFW + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            fw.append(diffFW)

            #Prod Gen -> Filter

            allJoin = fullJoins[x]\
            .sort_values(["fi_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])
            allJoin = allJoin[(pandas.notna(allJoin["Data.Measurement"])) & (pandas.notna(allJoin["fi_Consumer.Time"]))]

            diffPGF = (allJoin["fi_Consumer.Time"] - allJoin["pg_Producer.Timestamp"]).to_numpy()[:-removeLastX]
            timestampsPGF = allJoin["pg_Producer.Timestamp"].to_numpy()[:-removeLastX]
            
                        
            if(names[x][-1] == '7' and experiment == 1):
                plotDistributionOverTime(data=diffPGF,timestamps=timestampsPGF,path=currentPathPGF + os.path.sep + "data" + os.path.sep + names[x] + "zoom",firstTimestamp=firstTimestamp[x],xlim=(10,12),ylim=(0,20000),transitions=transitions[producerRun[x]])
            
            diffPGF = extractAvgMedStdMinMaxFromArray(diffPGF, timestampsPGF, currentPathPGF + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            pgf.append(diffPGF)
            
            
            #Prod Rec -> Filter

            allJoin = fullJoins[x]\
            .sort_values(["fi_Consumer.Time", "pr_Consumer.Time"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])
            allJoin = allJoin[(pandas.notna(allJoin["Data.Measurement"])) & (pandas.notna(allJoin["fi_Consumer.Time"]))]

            diffPRF = (allJoin["fi_Consumer.Time"] - allJoin["pr_Consumer.Time"]).to_numpy()[:-removeLastX]
            timestampsPRF = allJoin["pr_Consumer.Time"].to_numpy()[:-removeLastX]
            
                        
            if(names[x][-1] == '7' and experiment == 1):
                plotDistributionOverTime(data=diffPRF,timestamps=timestampsPRF,path=currentPathPRF + os.path.sep + "data" + os.path.sep + names[x] + "zoom",firstTimestamp=firstTimestamp[x],xlim=(10,12),ylim=(0,20000),transitions=transitions[producerRun[x]])
            
            diffPRF = extractAvgMedStdMinMaxFromArray(diffPRF, timestampsPRF, currentPathPRF + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            prf.append(diffPRF)

            #Prod Gen -> Prod Rec

            allJoin = fullJoins[x]\
            .sort_values(["pr_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])

            diffPPR = (allJoin["pr_Consumer.Time"] - allJoin["pg_Producer.Timestamp"]).to_numpy()[:-removeLastX]
            timestampsPPR = allJoin["pg_Producer.Timestamp"].to_numpy()[:-removeLastX]
            
                        
            if(names[x][-1] == '7' and experiment == 1):
                plotDistributionOverTime(data=diffPPR,timestamps=timestampsPPR,path=currentPathPPR + os.path.sep + "data" + os.path.sep + names[x] + "zoom",firstTimestamp=firstTimestamp[x],xlim=(10,12),ylim=(0,20000),transitions=transitions[producerRun[x]])
            
            
            diffPPR = extractAvgMedStdMinMaxFromArray(diffPPR, timestampsPPR, currentPathPPR + os.path.sep + "data" + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
            ppr.append(diffPPR)


            # Shade every transition that starts inside the plotted time range.
            # NOTE(review): `timestamps` here is still the Filter -> Warning array
            # assigned further up, not one of the three series plotted below - confirm this is intended.
            for t in transitions[producerRun[x]]:
                if t[1] > firstTimestamp[x] and t[1] < timestamps[-1]:
                    plt.axvspan(t[1] - firstTimestamp[x], t[2] - firstTimestamp[x], facecolor='r', alpha=0.2) 

            # Combined plot: the values returned by extractAvgMedStdMinMaxFromArray over time relative to the run start.
            plt.plot(timestampsPPR - firstTimestamp[x], diffPPR,label = "produced")
            plt.plot(timestampsPGF - firstTimestamp[x],diffPGF,label = "filtered")
            plt.plot(timestampsPGW - firstTimestamp[x],diffPGW,label = "warning")
            plt.legend()



            plt.savefig(currentPathAll + names[x] + ".jpg")
            plt.savefig(currentPathAll + names[x] + ".pdf")
            plt.close()

        except Exception as e:
            # Best effort: discard the half-drawn figure, report the run and go on with the next one.
            plt.close()
            print(e)
            print("Error", names[x], x)
        
    return fw, pw, pgw, prf, pgf, ppr

In [None]:
# Prepare the "data" sub-directory of every stage pair, then the combined-plot directory.
for stagePath in (currentPathFW, currentPathPW, currentPathPGW, currentPathPRF, currentPathPGF, currentPathPPR):
    prepareDictory(stagePath + os.path.sep + "data" + os.path.sep)
os.makedirs(currentPathAll, exist_ok=True)

# Results for the first run group only.
fwO, pwO, pgwO, prfO, pgfO, pprO = extractReceivedByFollowingTopic(runFor[0])


In [None]:
#print two together
def printTwoTogether(combineIds, namesForIds, store = True):

    indexes = [names.index(x) for x in combineIds]

    fig, ax = plt.subplots()

    plt.xscale("log")
    
    colors = ["b","g","r","c","y","b","m"]
    
    for x in range(len(indexes)):
        allJoin = fullJoins[indexes[x]]\
            .sort_values(["pr_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])

        diffPPR = (allJoin["pr_Consumer.Time"] - allJoin["pg_Producer.Timestamp"]).to_numpy()#[:-removeLastX]
        timestampsPPR = allJoin["pg_Producer.Timestamp"].to_numpy()#[:-removeLastX]
        sorted = np.sort(diffPPR)
        plt.plot(sorted,np.linspace(0, 1,len(sorted),endpoint=True), label = namesForIds[x], c = colors[int(names[indexes[x]][-1:]) - 1] if len(combineIds) > 4 else None)




    plt.xlabel("duration in ms")
    plt.ylabel("cumulative frequency")
    plt.legend()

    ax.xaxis.set_major_formatter(StrMethodFormatter('{x:.0f}'))
    ax.xaxis.set_minor_formatter(NullFormatter())

    cOutput = currentPathPPR + os.path.sep + "-".join(combineIds)
    if store:
        plt.savefig(cOutput + ".jpg", dpi = 300)
        plt.savefig(cOutput + ".pdf")
    plt.show()

#try:
#printTwoTogether(names, names, False)
printTwoTogether(["run_0_t3", "run_2_t3"], ["good placement (run 1)", "poor placement (run 3)"])
#except:
#    print("error")

In [None]:
def _share(mask):
    # Percentage of entries of a boolean array that are set.
    return (np.sum(mask) / len(mask)) * 100


for x, selection in enumerate(runFor):

    # Pick the results belonging to this run group from every stage pair.
    fw, pw, pgw, prf, pgf, ppr = (
        [series[i] for i in selection]
        for series in (fwO, pwO, pgwO, prfO, pgfO, pprO)
    )

    # Sum of the element at position 4 of each stage summary.
    worstCase = 0

    print(additionalText[x])

    print("Produced -> Produced Received")
    d = extractAvgMedStdMinMaxFromListOfArray(ppr, currentPathPPR + os.path.sep + additionalText[x],return_datas = True)
    worstCase += d[4]

    diffPPR = np.concatenate(ppr, axis = 0)
    print("More than 100 ms: %.2f%%"%_share(diffPPR > 100))
    print("More lower than 8 ms: %.2f%%"%_share(diffPPR < 8))

    print("Produced Gen -> Filtered")
    _ = extractAvgMedStdMinMaxFromListOfArray(pgf, currentPathPGF + os.path.sep + additionalText[x])

    print("Produced Rec -> Filtered")
    d = extractAvgMedStdMinMaxFromListOfArray(prf, currentPathPRF + os.path.sep + additionalText[x],return_datas = True)
    worstCase += d[4]

    print("Filtered -> Warning")
    d = extractAvgMedStdMinMaxFromListOfArray(fw, currentPathFW + os.path.sep + additionalText[x],return_datas = True)
    worstCase += d[4]

    print("ProducedRec -> Warning")
    _ = extractAvgMedStdMinMaxFromListOfArray(pw, currentPathPW + os.path.sep + additionalText[x])

    print("ProducedGen -> Warning")
    _ = extractAvgMedStdMinMaxFromListOfArray(pgw, currentPathPGW + os.path.sep + additionalText[x])
    diff = np.concatenate(pgw, axis = 0)
    print("Amount of anomalies:", len(diff))
    for limit, template in (
        (1000, "More than 1000 ms: %.2f%%"),
        (2000, "More than 2000 ms: %.4f%%"),
        (3000, "More than 3000 ms: %.4f%%"),
        (5000, "More than 5000 ms: %.4f%%"),
    ):
        print(template%_share(diff > limit))

    print("The absolute worst case Prod Gen -> Warning Rec is %d ms" %worstCase)

    if len(runFor) > 1:
        print("====================")

In [None]:
# Bare expression: as the only statement of this notebook cell it displays `filterDown`.
filterDown

In [None]:
def getMaxForEvents(indexes, times):
    """Print how far the maximum of each selected series lies above its median.

    `indexes` selects entries of `times` (and the matching `names`). For each
    series the first occurrence of the maximum is located; the median of the
    series without the 5*60*200 samples starting at that position is
    subtracted from the maximum. The per-series values and the min/max/median
    over all of them are printed. Nothing is printed or returned when
    `indexes` is empty.
    """
    if len(indexes) < 1:
        return

    window = 5*60*200

    print("found " + str(len(indexes)))
    overMedian = []
    for runIndex in indexes:
        series = times[runIndex]
        peak = np.max(series)
        peakPos = np.where(series == peak)[0][0]
        # Samples from the peak onwards, and everything except that window.
        fromPeak = series[peakPos:peakPos + window]
        remainder = np.concatenate((series[:peakPos], series[peakPos + window:]))
        medianWithout = np.median(remainder)
        overMedian.append(peak - medianWithout)
        print(names[runIndex], "maximum was",peak, "over median", peak - medianWithout, "median", np.median(series),"median diff before", np.median(fromPeak), "median diff one missing", medianWithout - np.median(fromPeak))
    overMedian = np.array(overMedian)
    print("distribution min", np.min(overMedian), "max", np.max(overMedian), "median", np.median(overMedian))
    print("----------------------------")
    
# Each report: headline, affected run indexes, series to inspect.
for headline, events, series in (
    ("max distribution if raw leader changes", rawDataLeaderChange, pprO),
    ("max distribution if filter changes", filterLeaderChange, pgfO),
    ("max distribution if raw replica changes", rawDataReplicaDown, pprO),
    ("max distribution if filter replica changes", filterReplicaDown, pgfO),
):
    print(headline)
    getMaxForEvents(events, series)

print("dist of r2 t2")
getMaxForEvents([names.index("run_2_t2")], pgfO)

In [None]:
# Bare expression: as the only statement of this notebook cell it displays `names`.
names


## Data loss produced to filter

In [None]:
def calcDataLoss(x,sortBy):
    """Count the produced elements of run `x` that have no value in column `sortBy`.

    Only rows with a "pg_ProducedElements" value below 239995 are considered;
    after sorting by `sortBy`, one row per produced element is kept.
    """
    frame = fullJoins[x]
    produced = frame["pg_ProducedElements"]
    # The last data is sometimes not flushed, so the tail is left out.
    considered = pandas.notna(produced) & (produced < 239995)
    onePerElement = frame[considered]\
        .sort_values([sortBy, "pg_ProducedElements"], ascending = True)\
        .drop_duplicates(subset = ["pg_ProducedElements"])
    present = onePerElement[sortBy].notna().sum()
    return len(onePerElement) - present
    
for x in nrange(len(names)):
    try:
        # produced to filtered
        missing = calcDataLoss(x, "fi_Consumer.Time")
    except Exception as e:
        plt.close()
        print(e)
        print("Error", names[x], x)
    else:
        if missing > 0:
            print("Error, %d missing" %missing, names[x])
   

## Latency produced - warning

As the filter uses a median filter of size 5, it needs 3 outliers to detect a change; otherwise the change is just skipped. ==> There must be a latency of at least (3 * 5 ms) = 15 ms.

### Check all anomalies were detected

In [None]:
for x in nrange(len(names)):
    try:
        joined = fullJoins[x]
        # Rows with a measurement above 1 among the first 240000 produced elements.
        needsWarning = (joined["Data.Measurement"] > 1) & (joined["pg_ProducedElements"] < 240000)
        warningRequired = joined[needsWarning]\
            .sort_values(["wa_Consumer.Time", "fi_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])

        warningsPossible = len(warningRequired)
        notfound = warningRequired["wa_Consumer.Time"].isnull().sum()
        print("Found %d warnings" %(warningsPossible))
        if notfound > 0:
            print("%s: For %d / %d anomalies there were no belonging warnings" %(names[x], notfound, warningsPossible))

    except Exception as e:
        plt.close()
        print(e)
        print("Error", names[x], x)

In [None]:
currentPath = f"{output}anomalyDetectionTime"
prepareDictory(currentPath + os.path.sep)

diffList = []

for x in nrange(len(names)):
    try:
        # Attach the joined pipeline data to every model change via the produced-element number.
        changes = modelchange[x].set_index("producedElements")
        pipeline = fullJoins[x].set_index("pg_ProducedElements")
        join = changes.join(pipeline)
        join.index.name = "pg_ProducedElements"

        join = join.reset_index()\
            .sort_values(["wa_Consumer.Time", "fi_Consumer.Time", "pg_ProducedElements"], ascending = True)\
            .drop_duplicates(subset = ["pg_ProducedElements"])

        join = join[join["value"] == 10]
        timestamps = join["pg_Producer.Timestamp"].to_numpy()
        diff = (join["wa_Consumer.Time"] - join["pg_Producer.Timestamp"]).to_numpy()

        # Drop the last entry when it has no difference value.
        if np.isnan(diff[-1]):
            diff, timestamps = diff[:-1], timestamps[:-1]

        diff = extractAvgMedStdMinMaxFromArray(diff, timestamps, currentPath + os.path.sep, names[x], firstTimestamp[x], transitions[producerRun[x]])
        diffList.append(diff)

    except Exception as e:
        plt.close()
        print(e)
        print("Error", names[x], x)

print("Time to detect an anomaly (initial value)")
diff = extractAvgMedStdMinMaxFromListOfArray(diffList, currentPath)
print("Amount of anomalies:", len(diff))

## Usage

In [None]:
def extractUsage(lines, server, run, transition):
    """Parse the lines of one usage log and plot CPU and memory usage per process.

    A line starting with "Time " opens a new sample; the text after that
    prefix (without the line's last character) is parsed as the integer
    sample time. Every other line is split on spaces and read as
    cpu, mem, pid, <unused>, name, container. Names not starting with "etcd"
    are split on "_": entries whose second part is "POD" are not plotted on
    their own (they still count towards the totals), the others are renamed
    to "<part 1>-<part 2>".

    Besides one series per pid, the summed cpu/mem of each sample is stored
    under the pseudo pid -1 with the name "all"; samples whose memory sum is
    zero are skipped. The totals of the final sample are stored as well (the
    loop alone would only store them when the next "Time " line arrives).

    Two figures are written as .pdf and .jpg below `output`/usage/cpu and
    `output`/usage/mem, named "<server>_<run>". In the CPU figure every entry
    of `transition` (if not None) whose start t[1]/1000 lies after the first
    sample time is shaded from t[1]/1000 to t[2]/1000.
    """
    def emptySeries(pid, name, container):
        # Parallel sample lists plus the identification of one plotted series.
        return {
            "mem": [],
            "cpu": [],
            "pid" : pid,
            "running" : [],
            "name" : name,
            "container" : container
        }

    data = {}

    def storeTotals(when, totalCPU, totalMem):
        # Record the summed usage of one sample; empty samples are skipped.
        if totalMem > 0:
            totals = data.setdefault(-1, emptySeries(-1, "all", "all"))
            totals["mem"].append(totalMem)
            totals["cpu"].append(totalCPU)
            totals["running"].append(when)

    date = None
    sumMem = 0
    sumCPU = 0
    for l in lines:
        if l.startswith("Time "):
            # A new sample starts: store the totals of the previous one.
            storeTotals(date, sumCPU, sumMem)
            sumMem = 0
            sumCPU = 0
            date = int(l[5:-1])
            continue

        e = [token for token in l.split(' ') if token != ""]
        cpu = float(e[0])
        mem = float(e[1])
        sumCPU += cpu
        sumMem += mem
        pid = int(e[2])
        name = e[4]
        container = e[5]

        if not name.startswith("etcd"):
            parts = name.split("_")
            if parts[1] == "POD":
                continue
            name = parts[1] + "-" + parts[2]

        pid_data = data.setdefault(pid, emptySeries(pid, name, container))
        pid_data["mem"].append(mem)
        pid_data["cpu"].append(cpu)
        pid_data["running"].append(date)

    # The last sample has no following "Time " line, so store its totals here.
    storeTotals(date, sumCPU, sumMem)

    outpath = output + "usage" + os.path.sep

    outpathMem = outpath + "mem" + os.path.sep
    outpathCPU = outpath + "cpu" + os.path.sep

    os.makedirs(outpathMem, exist_ok=True)
    os.makedirs(outpathCPU, exist_ok=True)

    # CPU figure; minTime becomes the earliest sample time over all series.
    minTime = 9999999999999999
    for series in data.values():
        running = series["running"]
        minTime = min(min(running), minTime)
        plt.plot(running, series["cpu"], label = series["name"][:40])
    if transition is not None:
        for t in transition:
            # Transition bounds are divided by 1000 before being compared with the sample times.
            if t[1]/1000 > minTime:
                plt.axvspan(t[1] / 1000, t[2]/1000, facecolor='r', alpha=0.2)
    plt.legend()
    plt.ylabel("CPU usage in %")
    #plt.ylim((0,5))
    plt.savefig(outpathCPU + server + "_" + str(run) + ".pdf")
    plt.savefig(outpathCPU + server + "_" + str(run) + ".jpg", dpi = 300)
    plt.close()

    # Memory figure (no transition shading).
    for series in data.values():
        plt.plot(series["running"], series["mem"], label = series["name"][:40])
    plt.ylabel("memory usage in %")
    plt.legend()
    #plt.ylim((0,5))
    plt.savefig(outpathMem + server + "_" + str(run) + ".pdf")
    plt.savefig(outpathMem + server + "_" + str(run) + ".jpg", dpi = 300)
    plt.close()

In [None]:
# Plot the usage log ("ps.log") of every server of every run.
for run in runs:
    p = path + run + os.path.sep + "logs" + os.path.sep + "usage" + os.path.sep
    servers = os.listdir(p)
    for server in tqdm(servers):
        # The context manager closes the log even when reading it fails.
        with open(p + server + os.path.sep + "ps.log", 'r') as file:
            lines = file.readlines()
        extractUsage(lines, server, run, transitions[run])

## Responsibilities

This already shows the server ID, not the Kafka ID.

In [None]:
# Bare expression: as the only statement of this notebook cell it displays `transitions`.
transitions

In [None]:
# Bare expression: as the only statement of this notebook cell it displays `names`.
names