# Quantitative Accuracy Comparison Across Histogram Implementations

In [29]:
%run preamble.py
%matplotlib inline

from itertools import zip_longest
from circllhist import *
import numpy as np

# Data Import

In [37]:
!ls -l datasets

total 74512
-rw-r--r-- 1 jovyan users 17905959 May  8 19:58 1440-60.tsv
-rw-r--r-- 1 jovyan users 16981727 May  8 20:21 24-3600.tsv
-rw-r--r-- 1 jovyan users 37884449 May  8 19:58 4320-60.tsv
-rw-r--r-- 1 jovyan users   745516 May  8 19:58 60-60.tsv


In [38]:
# Dataset selector: COUNT batches of PERIOD seconds each
# (both values are echoed in the summary header at the end of the notebook).
COUNT=24
PERIOD=60*60
# The input file name is derived from both values, e.g. datasets/24-3600.tsv
FILENAME="datasets/{}-{}.tsv".format(COUNT,PERIOD)

def parse_line(l):
    """Split one tab-separated record into a ``(batch_id, value)`` tuple.

    The text before the tab is converted with ``int``, the text after it with
    ``float``. A line that does not contain exactly one tab raises
    ``ValueError`` when the two fields are unpacked.
    """
    batch_field, value_field = l.split("\t")
    batch_id = int(batch_field)
    value = float(value_field)
    return batch_id, value

# Group the values by batch id.
# A dict keyed by batch id replaces the pre-sized list: an unexpected id
# greater than COUNT can no longer raise IndexError, and a negative id can no
# longer wrap around silently into one of the last batches.
values_by_batch = {}
with open(FILENAME) as fh:
    for line in fh:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at the end of the file
        batch_id, val = parse_line(line)
        if batch_id < 0 or batch_id >= COUNT:
            # presumably ids lie in [0, COUNT); show the offending record but keep its value
            print(line)
        values_by_batch.setdefault(batch_id, []).append(val)

# Eliminate empty batches: only ids that actually occurred are present,
# and the batches stay ordered by batch id
raw_batches = [ values_by_batch[batch_id] for batch_id in sorted(values_by_batch) ]

# Raw dataset

In [39]:
# All values of all batches concatenated into one array;
# the reference ("true") statistics of the merged dataset are computed from it
raw_total = np.concatenate(raw_batches)

In [41]:
%%script false
## VISUALS ##
from matplotlib import pyplot as plt
import seaborn as sns
BATCH_SKIP=100
SKIP=50
AX_COUNT=int(COUNT/BATCH_SKIP)
fig = plt.figure(figsize=(30,.5*AX_COUNT))
fig.subplots_adjust(hspace = .8)
for i,batch in enumerate(raw_batches):
    if (i % BATCH_SKIP != 0): continue
    ax = plt.subplot(AX_COUNT,1,int(i/BATCH_SKIP)+1)
    ax.set_xlim(0,500)
    sns.rugplot(batch[::SKIP], ax=ax, alpha=0.5,height=1)

# Exact computations

For this example we will use the following statistics:

- mean
- median
- p95, p99, p99.9, p99.99
- max

and compare the relative error of each aggregated (merged) result against the exact value computed from the raw data.

In [42]:
# Names of the statistics, in the column order used by every report table below
stats = "mean median p95 p99 p999 p9999 max".split()

In [43]:
def _percentile_stat(q):
    """Return a one-argument function computing the q-th percentile with NumPy."""
    return lambda R: np.percentile(R, q)

# Exact statistics computed with NumPy: statistic name -> function of the sample values.
stats_np = {
    "mean": np.mean,
    "median": _percentile_stat(50),
    "p95": _percentile_stat(95),
    "p99": _percentile_stat(99),
    "p999": _percentile_stat(99.9),
    "p9999": _percentile_stat(99.99),
    "max": _percentile_stat(100),
}

In [44]:
# Reference values: every statistic evaluated on the complete raw dataset
stats_total = { k : stats_np[k](raw_total) for k in stats }

In [45]:
print("True stats of the merged dataset")

def relative_error_pct(true_val, val):
    """Signed deviation of ``val`` from ``true_val``, in percent of ``true_val``.

    For a positive ``true_val`` the result is positive when ``val`` is larger
    than ``true_val`` and negative when it is smaller.
    """
    return (val - true_val) / true_val * 100

def p_head(prefix):
    """Print the table header: ``prefix`` followed by every name in ``stats``, each right-aligned to 20 characters."""
    header_cells = []
    for column_name in stats:
        header_cells.append("{:>20}".format(column_name))
    print(prefix + "".join(header_cells))
def p_rec(prefix, rec):
    """Print one table row: ``prefix`` followed by ``rec[k]`` for every ``k`` in ``stats``.

    Each value is right-aligned to 20 characters and shown with three decimals.
    """
    value_cells = ("{:>20.3f}".format(rec[column_name]) for column_name in stats)
    print(prefix + "".join(value_cells))
def p_report(name, stats_t):
    """Print two rows for one method: its values, then their relative error in percent.

    The error of every statistic in ``stats`` is computed against the
    corresponding entry of ``stats_total``.
    """
    stats_err = {}
    for column_name in stats:
        stats_err[column_name] = relative_error_pct(stats_total[column_name], stats_t[column_name])
    value_prefix = name + "      "
    error_prefix = name + " ERR% "
    p_rec(value_prefix, stats_t)
    p_rec(error_prefix, stats_err)
    
p_head("")
p_rec("", stats_total)

True stats of the merged dataset
                mean              median                 p95                 p99                p999               p9999                 max
             195.397              65.515             290.726            3468.665           18774.050           34244.469           41989.939


# Naive Aggregation: Mean values

In [46]:
# Row wise statistics: one dict per batch, holding every statistic of `stats`
# computed on that batch's values alone
SL = [ { k : stats_np[k](B) for k in stats }  for B in raw_batches ]

In [47]:
# Show the per-batch statistics of the first 10 batches only
p_head("batch")
for i,S in enumerate(SL[:10]):
    p_rec("{:>5} ".format(i), S)
print("...")

batch                mean              median                 p95                 p99                p999               p9999                 max
    0               48.166              37.660              76.092             353.212            1385.642            3756.790            4477.232
    1               61.119              51.736              84.757             363.757            1883.959            3833.551            4639.391
    2               75.071              64.348              98.255             371.042            2858.847            4207.381            9142.573
    3              115.738              65.391             238.052            1338.533            5273.472            8206.849            8737.645
    4              127.681              72.844             302.986            1541.056            3907.465            5793.125            6720.831
    5              312.985              76.067             318.590            8527.715           10414.728           10

In [48]:
# Column Averages

In [49]:
# Naive aggregation: for every statistic, the plain mean of the per-batch values
stats_avg = { k : np.mean([ S[k] for S in SL ]) for k in stats }

# Header, exact reference row, then the averaged values with their relative error
p_head("             ")
p_rec("             ",stats_total)
p_report("Average", stats_avg)

                             mean              median                 p95                 p99                p999               p9999                 max
                          195.397              65.515             290.726            3468.665           18774.050           34244.469           41989.939
Average                   159.102              70.081             396.729            2259.214            5154.266            7237.445            8672.218
Average ERR%              -18.575               6.970              36.462             -34.868             -72.546             -78.865             -79.347


# HDR Histograms

In [50]:
def circllhist_from_valuelist(L):
    """Create a new ``Circllhist`` and insert every value of ``L`` into it, one by one."""
    hist = Circllhist()
    for value in L:
        hist.insert(value)
    return hist

In [51]:
# One histogram per batch
HL = [ circllhist_from_valuelist(B) for B in raw_batches  ]

In [52]:
# Merge all per-batch histograms into one total histogram
HT = Circllhist()
for H in HL:
    HT.merge(H)

In [53]:
def _quantile_stat(q):
    """Return a one-argument function that asks a histogram for its q-quantile."""
    return lambda H: H.quantile(q)

# Statistics read from a histogram object: statistic name -> function of the histogram.
stats_circllhist_f = {
    # length of the base64 text scaled by 3/4
    # (presumably the size of the encoded histogram in bytes, ignoring padding)
    "size": lambda H: len(H.to_b64()) / 4 * 3,
    "mean": lambda H: H.mean(),
    "median": _quantile_stat(.5),
    "p95": _quantile_stat(.95),
    "p99": _quantile_stat(.99),
    "p999": _quantile_stat(.999),
    "p9999": _quantile_stat(.9999),
    "max": _quantile_stat(1),
}

In [54]:
# Statistics of the merged histogram ("size" is not part of `stats` and is therefore not evaluated here)
stats_circllhist = { k : stats_circllhist_f[k](HT) for k in stats }

In [55]:
# Header, exact reference row, then the histogram values with their relative error
p_head("                ")
p_rec("                ",stats_total)
p_report("CIRCLLHIST", stats_circllhist)

                                mean              median                 p95                 p99                p999               p9999                 max
                             195.397              65.515             290.726            3468.665           18774.050           34244.469           41989.939
CIRCLLHIST                   195.472              65.513             290.663            3469.678           18728.847           34201.988           42000.000
CIRCLLHIST ERR%                0.038              -0.003              -0.022               0.029              -0.241              -0.124               0.024


# T-Digest

The Python version can't do merges, so we use a Java tool instead.

In [56]:
# Pipe the dataset into the Java merge tool (quantiles to report are passed as arguments);
# `tee` shows the tool's standard output and also saves it to results.tsv, which the next cell reads
!cat $FILENAME | java -jar tdigest-merge-tool.jar .5 .9 .95 .99 .999 .9999 1 | tee results.tsv

HistogramMergeTool
- Input Quantile: 0.500000
- Input Quantile: 0.900000
- Input Quantile: 0.950000
- Input Quantile: 0.990000
- Input Quantile: 0.999000
- Input Quantile: 0.999900
- Input Quantile: 1.000000
Merging  24 batches
size	1468
q0.500000	65.735235
q0.900000	114.754051
q0.950000	300.677732
q0.990000	3494.516689
q0.999000	18787.130406
q0.999900	34268.529730
q1.000000	41989.938660


In [57]:
print()
def import_results():
    """Read ``results.tsv`` and return its ``key<TAB>value`` records as ``{key: float(value)}``.

    Lines that do not consist of exactly two tab-separated fields (blank
    lines, banner or log lines) are skipped instead of aborting the whole
    import with an unpacking error.

    Raises:
        ValueError: if the value of a two-field line is not a valid float.
    """
    results = {}
    with open("results.tsv") as fh:
        for line in fh:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                continue  # not a key/value record
            key, value = fields
            results[key] = float(value)
    return results

res = import_results();

# T-Digest results arranged under the statistic names used by the report helpers
stats_td = {
    "size" : res["size"],
    # not given by the tool; np.nan is the spelling available in every NumPy
    # version (the upper-case np.NAN alias is not available in NumPy 2.x)
    "mean" : np.nan,
    "median" : res["q0.500000"],
    "p95"  : res["q0.950000"],
    "p99"  : res["q0.990000"],
    "p999" : res["q0.999000"],
    "p9999": res["q0.999900"],
    "max"  : res["q1.000000"],
}

p_report("TDigest", stats_td)


TDigest                       nan              65.735             300.678            3494.517           18787.130           34268.530           41989.939
TDigest ERR%                  nan               0.337               3.423               0.745               0.070               0.070               0.000


# Summary

In [59]:
# Final comparison table: exact values first, then every aggregation method
# with its values and relative error against the exact values
print("\n\n### Merging {} batches @ {} seconds ###\n\n".format(COUNT, PERIOD))
p_head("                ")
p_rec("TOTAL:          ", stats_total)
print()
p_report("AVG       ", stats_avg)
print()
p_report("CIRCLLHIST", stats_circllhist)
print()
p_report("TDigest   ", stats_td)




### Merging 24 batches @ 3600 seconds ###


                                mean              median                 p95                 p99                p999               p9999                 max
TOTAL:                       195.397              65.515             290.726            3468.665           18774.050           34244.469           41989.939

AVG                          159.102              70.081             396.729            2259.214            5154.266            7237.445            8672.218
AVG        ERR%              -18.575               6.970              36.462             -34.868             -72.546             -78.865             -79.347

CIRCLLHIST                   195.472              65.513             290.663            3469.678           18728.847           34201.988           42000.000
CIRCLLHIST ERR%                0.038              -0.003              -0.022               0.029              -0.241              -0.124               0.024

TDigest  