## Open Questions

- Why am I losing data between Produced and Filtered?

### Imports

In [164]:
import pandas
import numpy as np
np.set_printoptions(suppress=True)

In [165]:
# Directory holding the exported CSV files and the topic whose export is analysed.
path = "~/work/data/test"
topic = "t18"
# Common file-name prefix shared by all CSV exports of this topic.
full_path = "/".join((path, topic))

In [166]:
# Load every CSV export of the topic; all files share the `full_path` prefix
# and differ only in their suffix.
produced_df = pandas.read_csv(f"{full_path}_produced.csv")
start_df = pandas.read_csv(f"{full_path}.csv")
filtered_df = pandas.read_csv(f"{full_path}_filtered.csv")
warnings_df = pandas.read_csv(f"{full_path}_warnings.csv")
modelchange_df = pandas.read_csv(f"{full_path}_modelchange.csv")

### Helper

In [248]:
def extractAvgMedStdMinMaxFromArray(diff):
    """Summarise a numeric array with five descriptive statistics.

    Parameters
    ----------
    diff : array-like
        Values to summarise.

    Returns
    -------
    tuple
        ``(average, median, standard deviation, minimum, maximum)`` as
        computed by the corresponding NumPy functions, in that order.
    """
    statistics = (np.average, np.median, np.std, np.min, np.max)
    return tuple(statistic(diff) for statistic in statistics)

### Information on the time difference between the arrival times of records for all 3 topics

In [249]:
def avgMedStdArivalTime(df, column):
    """Return statistics on the differences between consecutive rows of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from.
    column : int
        Positional index of the column; its values are cast to ``int``.

    Returns
    -------
    tuple
        ``(average, median, std, min, max)`` of ``value[i + 1] - value[i]``
        over the rows in their stored order.
    """
    values = df.to_numpy()[:, column].astype(int)
    # Successive differences between neighbouring rows.
    gaps = np.diff(values)
    return extractAvgMedStdMinMaxFromArray(gaps)
    
def getArivalInfos():
    """Print the row-to-row difference statistics for the three loaded frames.

    Reads the module-level frames ``produced_df`` (column 3), ``filtered_df``
    (column 0) and ``warnings_df`` (column 0) and prints one formatted
    paragraph per frame.
    """
    template = "%s:\nGot in average every %.2f ms new data, with median %.2f, and std = %.2f, further min was %d and max %d\n\n"

    def report(label, frame, column):
        # Label first, then the five statistics in the order the template expects.
        stats = avgMedStdArivalTime(frame, column)
        print(template % ((label,) + tuple(stats)))

    report("produced", produced_df, 3)
    report("filtered", filtered_df, 0)
    report("warning", warnings_df, 0)
    
# Print the statistics for the produced, filtered and warning frames.
getArivalInfos()

produced:
Got in average every 5.00 ms new data, with median 5.00, and std = 3.96, further min was 0 and max 393


filtered:
Got in average every 5.00 ms new data, with median 5.00, and std = 31.76, further min was 0 and max 8001


Got in average every 4471.23 ms new data, with median 434.50, and std = 9353.46, further min was 0 and max 40491




## Check Produced

### How long does it take until a produced record is acknowledged by kafka

In [250]:
def kafkaAck(df):
    """Return statistics on the per-row difference ``column 3 - column 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns 1 and 3 (positional) are cast to ``int``.

    Returns
    -------
    tuple
        ``(average, median, std, min, max)`` of ``column 3 - column 1``.

    NOTE(review): the previous local names called column 1 "ack" and column 3
    "send" while computing ``send - ack``; presumably column 1 is the send time
    and column 3 the acknowledgement time - confirm against the CSV header.
    """
    values = df.to_numpy()
    column_one = values[:, 1].astype(int)
    column_three = values[:, 3].astype(int)
    return extractAvgMedStdMinMaxFromArray(column_three - column_one)
    
# Report the kafkaAck statistics (column 3 - column 1) of the produced records.
print("Produced data was received by Kafka after: avg = %.2f ms; median %.2f ms; std = %.2f ms, further min was %d ms and max %d ms" %kafkaAck(produced_df))

Produced data was received by Kafka after: avg = 0.02 ms; median 0.00 ms; std = 0.25 ms, further min was 0 ms and max 42 ms


### check validity of produced data

In [169]:
def validate(df):
    """Check that rows never decrease and that column 0 grows by exactly 1.

    Every row is compared with its predecessor:

    * all columns must be greater than or equal to the previous row's values;
      otherwise ``Error:`` and both rows are printed;
    * column 0 must be exactly the previous value plus one; otherwise an
      "Offset ... increased not by 1" line is printed.

    ``Valid`` is printed only when neither kind of problem was found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check; column 0 is treated as the offset.

    Returns
    -------
    None
        The result is reported on stdout only.
    """
    rows = df.to_numpy()
    if len(rows) == 0:
        # Nothing to compare; avoid indexing into an empty array.
        print("No records to validate")
        return

    error = False
    for lastRow, x in zip(rows[:-1], rows[1:]):
        # Every column has to be monotonically non-decreasing, regardless of
        # how many columns the frame has.
        if not np.all(x >= lastRow):
            error = True
            print("Error:")
            print(lastRow)
            print(x)
        if lastRow[0] + 1 != x[0]:
            # An offset gap is a validation failure too, so "Valid" must not
            # be printed after it.
            error = True
            print("Offset %i increased not by 1" % lastRow[0])

    if not error:
        print("Valid")
        
# Run the row-order and offset check on the produced records.
validate(produced_df)

Valid


## Data loss produced to filter

In [170]:
def calcDataLoss(df1, df1OffsetColumn, df2, df2OffsetColumn):
    """Report how often offsets of ``df1`` do not occur exactly once in ``df2``.

    For every offset in ``df1`` the number of occurrences in ``df2`` is looked
    up. Offsets found exactly once are fine; every other occurrence count is
    tallied and printed as ``"<count>x for <n> times; "``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the reference offsets.
    df1OffsetColumn : int
        Positional index of the offset column in ``df1``.
    df2 : pandas.DataFrame
        Frame in which the offsets are searched.
    df2OffsetColumn : int
        Positional index of the offset column in ``df2``.

    Returns
    -------
    None
        The summary line is printed to stdout.
    """
    df1Offsets = df1.to_numpy()[:, df1OffsetColumn].astype(int)
    df2Offsets = df2.to_numpy()[:, df2OffsetColumn].astype(int)

    # Count every offset of df2 once up front instead of rescanning the whole
    # array for each offset of df1.
    values, counts = np.unique(df2Offsets, return_counts=True)
    occurrences = dict(zip(values.tolist(), counts.tolist()))

    # Maps "times seen in df2" -> "number of df1 offsets seen that often".
    errors = {}
    for offset in df1Offsets.tolist():
        count = occurrences.get(offset, 0)
        if count != 1:
            errors[count] = errors.get(count, 0) + 1

    out = "Received records "
    for k, v in errors.items():
        out += "%ix for %i times; " % (k, v)
    print(out)
    
# Compare the offsets in column 0 of produced_df with column 3 of filtered_df.
calcDataLoss(produced_df, 0, filtered_df, 3)    

Received records 0x for 91 times; 


## Latency produced - warning

In [253]:
def calcLatencyProduceWarning(modelchange_df, warnings_df, amplitude, periodLength):
    """Measure how long after a model change the first overlapping warning appears.

    The model-change rows are walked in order. A row whose columns 1 and 2
    equal ``amplitude`` and ``periodLength`` closes a window that started at
    the first differing row seen since the previous close. All warnings whose
    ``[column 6, column 7]`` range touches that window are collected, and the
    smallest column-0 value among them minus the window start is recorded.

    Presumably column 0 of both frames and columns 6/7 of the warnings are
    timestamps in ms, and the given amplitude/period describe the baseline
    model - TODO confirm against the CSV headers.

    Parameters
    ----------
    modelchange_df : pandas.DataFrame
        Model changes; columns 0, 1 and 2 (positional) are read.
    warnings_df : pandas.DataFrame
        Warnings; columns 0, 6 and 7 (positional) are read.
    amplitude, periodLength
        Values of columns 1 and 2 that mark a closing row.

    Returns
    -------
    tuple
        ``(average, median, std, min, max)`` of the recorded differences.

    Raises
    ------
    ValueError
        If a window has no overlapping warning, or if no row matches
        ``amplitude`` / ``periodLength`` so that nothing could be measured.
    """
    m = modelchange_df.to_numpy()
    w = warnings_df.to_numpy()
    lastChange = m[0]

    # True while a differing row has been seen and its closing row is pending.
    changePending = False
    # Per-warning counter of how many windows the warning was matched to.
    matchCounts = None

    diff = []

    for x in m:
        if x[1] == amplitude and x[2] == periodLength:
            changePending = False
            windowStart, windowEnd = lastChange[0], x[0]

            # Warnings lying completely inside the window ...
            inner = np.logical_and(w[:, 6] >= windowStart, w[:, 7] <= windowEnd)
            # ... and warnings straddling its start or its end.
            leftOuter = np.logical_and(w[:, 6] <= windowStart, w[:, 7] >= windowStart)
            rightOuter = np.logical_and(w[:, 6] <= windowEnd, w[:, 7] >= windowEnd)
            fullRange = np.logical_or(np.logical_or(leftOuter, inner), rightOuter)

            # Keep integer counts from the very first window on, so the
            # diagnostic output below really shows 0/1/2... per warning.
            if matchCounts is None:
                matchCounts = fullRange.astype(int)
            else:
                matchCounts = np.add(matchCounts, fullRange.astype(int))

            matching = w[fullRange]
            if len(matching) == 0:
                raise ValueError(
                    "No warning overlaps the model change window %s - %s" % (windowStart, windowEnd)
                )

            # First recognition: smallest column-0 value of the matched warnings.
            firstRecognition = min(matching[:, 0])
            diff.append(firstRecognition - windowStart)
        else:
            # Only the first differing row after a close starts the window;
            # further differing rows before the next close are skipped.
            if not changePending:
                lastChange = x
            changePending = True

    if matchCounts is None:
        raise ValueError(
            "No model change row matches amplitude=%s and periodLength=%s" % (amplitude, periodLength)
        )

    if not np.all(matchCounts == 1):
        print("Didn't match all recognized changes! With 0 weren't matched:")
        print(matchCounts)

    diff = np.array(diff)

    return extractAvgMedStdMinMaxFromArray(diff)

# Report the latency statistics, using amplitude 3 and period length 1000 to identify the closing rows.
print("Anomalies were detected by Kafka after: avg = %.2f ms; median %.2f ms; std = %.2f ms, further min was %d ms and max %d ms" %calcLatencyProduceWarning(modelchange_df, warnings_df, 3, 1000))

Anomalies were detected by Kafka after: avg = 2633.90 ms; median 2026.50 ms; std = 1708.79 ms, further min was 1145 ms and max 7459 ms
