In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess
import re


In [60]:
# Input file handed to the estimator binary on every run (-f).
path = "streams/random_stream-n50000-N100000-a4.txt"
# Dataset-sweep variant: iterate over every .txt file found in datasets/ instead.
#paths = [file_name for file_name in os.listdir("datasets") if file_name.endswith('.txt')]

# Values passed to the binary through -p; each one gets its own result bucket.
#param = 64
params = [16, 32, 64, 128, 256, 512]

# Seeds 1..num_executions are each run once per parameter value.
num_executions = 30000

# Leading text of an output line -> key of the list that collects its value.
line_prefix_to_algo = {
    "Recordinality": "REC",
    "HyperLogLog": "HLL",
    "Probabilistic Counting": "PCSA",
}

# One list of estimates per algorithm, grouped by parameter value.
results = {
    param: {algo: [] for algo in line_prefix_to_algo.values()}
    for param in params
}
#results = {path: {"REC": [], "HLL": [], "PCSA": []} for path in paths}

for param in params:
#for path in paths:
    estimates = results[param]  # results[path] in the dataset-sweep variant
    for seed in range(1, num_executions + 1):
        command = ["./Cardinality", "-f", path, "-p", str(param), "--seed", str(seed)]
        proc = subprocess.run(command, capture_output=True, text=True)

        # A failed run is reported and contributes no estimates.
        if proc.returncode != 0:
            print(f"Error with seed {seed}, path {path} and param {param}: {proc.stderr}")
            continue

        for line in proc.stdout.splitlines():
            for prefix, algo in line_prefix_to_algo.items():
                if line.startswith(prefix):
                    # The value is the field that follows the first ": ".
                    estimates[algo].append(float(line.split(": ")[1]))
                    break


KeyboardInterrupt: 

# LaTeX table comparing real vs. estimated cardinality

In [None]:
# Build one LaTeX table row per dataset: dataset name, reference count, then for
# each algorithm its mean estimate and its accuracy in percent.
#
# Expects `results` to be keyed by dataset file name (the commented-out `paths`
# variant of the execution cell). Iterating over `results` itself keeps this cell
# consistent with what was actually collected, without a separate `paths` list.
averages = {
    path: {algo: np.mean(scores) for algo, scores in algo_scores.items()}
    for path, algo_scores in results.items()
}

latex_output = ""
for path, scores in averages.items():
    # Strip only the final extension so names containing dots stay intact.
    dataset = os.path.splitext(path)[0]

    # The reference ("real") value is the number of lines of datasets/<name>.dat.
    # Counting lazily avoids loading the whole file into memory.
    with open(os.path.join("datasets", dataset + ".dat"), "r") as file:
        line_count = sum(1 for _ in file)

    row = f"\\textbf{{{dataset}}} & \\textbf{{{line_count}}}"
    for algo, avg in scores.items():
        # Accuracy = 100 minus the relative error of the mean estimate, in percent.
        accuracy = 100 - (abs(avg - line_count) / line_count) * 100
        # "\\%" emits the LaTeX-escaped percent sign; a bare "\%" is an invalid
        # Python escape sequence and triggers a SyntaxWarning on recent versions.
        row += f" & {avg:.0f} & {accuracy:.1f}\\%"
    latex_output += row + " \\\\ \n"

print(latex_output)

\textbf{mare-balena} & textbf{5670} & 5656 & 99.7\% & 5662 & 99.9\% & 5696 & 99.5\% \\ 
\textbf{dracula} & textbf{9425} & 9413 & 99.9\% & 9405 & 99.8\% & 9464 & 99.6\% \\ 
\textbf{quijote} & textbf{23034} & 23023 & 100.0\% & 23057 & 99.9\% & 23142 & 99.5\% \\ 
\textbf{crusoe} & textbf{6245} & 6246 & 100.0\% & 6234 & 99.8\% & 6274 & 99.5\% \\ 
\textbf{war-peace} & textbf{17475} & 17516 & 99.8\% & 17464 & 99.9\% & 17554 & 99.5\% \\ 
\textbf{midsummer-nights-dream} & textbf{3136} & 3136 & 100.0\% & 3136 & 100.0\% & 3157 & 99.3\% \\ 
\textbf{valley-fear} & textbf{5829} & 5854 & 99.6\% & 5827 & 100.0\% & 5858 & 99.5\% \\ 
\textbf{iliad} & textbf{8925} & 8913 & 99.9\% & 8912 & 99.9\% & 8964 & 99.6\% \\ 



# LaTeX table comparing parameter values on the datasets

In [55]:
# Build one LaTeX table row per parameter value for the dataset named by `path`:
# for each algorithm, its mean estimate and the standard deviation of its
# estimates divided by the reference count.
averages = {
    param: {algo: np.mean(scores) for algo, scores in results[param].items()}
    for param in params
}

# The reference count is the number of lines of datasets/<name>.dat, so `path`
# must name a dataset: a stream path such as "streams/..." has no companion .dat
# file and open() raises FileNotFoundError.
dat_path = os.path.join("datasets", os.path.splitext(path)[0] + ".dat")
with open(dat_path, "r") as file:
    # Count lazily instead of materialising every line in a list.
    line_count = sum(1 for _ in file)

# The file is already closed here; formatting only needs the count.
latex_output = ""
for param, scores in averages.items():
    row = f"\\textbf{{{param}}}"
    for algo, avg in scores.items():
        # Population standard deviation (ddof=0), normalised by the reference count.
        error = np.std(results[param][algo], ddof=0) / line_count
        row += f" & {avg:.0f} & {error:.2f}"
    latex_output += row + " \\\\ \n"

print(latex_output)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/streams/random_stream-n5000-N40000-a4.dat'

# LaTeX table comparing parameter values on random streams


In [59]:
# Mean estimate of every algorithm, grouped by parameter value.
averages = {}
for param in params:
    per_algo = results[param]
    averages[param] = {algo: np.mean(per_algo[algo]) for algo in per_algo}

latex_output = ""

# Stream file names carry three integers in the form n<int>-N<int>-a<int>.txt.
pattern = r'n(\d+)-N(\d+)-a(\d+)\.txt'
match = re.search(pattern, path)
if match is not None:
    # Only n is used below, as the normalising count for the spread.
    # NOTE(review): presumably n is the number of distinct elements of the
    # stream; confirm against the stream generator.
    n, N, alpha = (int(group) for group in match.groups())

    table_rows = []
    for param, scores in averages.items():
        cells = [f"\\textbf{{{param}}}"]
        for algo, avg in scores.items():
            # Population standard deviation (ddof=0) of the estimates, divided by n.
            error = np.std(results[param][algo], ddof=0) / n
            cells.append(f"{avg:.0f} & {error:.2f}")
        table_rows.append(" & ".join(cells) + " \\\\ \n")
    latex_output = "".join(table_rows)

# When the file name does not match the pattern, the table stays empty.
print(latex_output)

\textbf{16} & 19397 & 0.65 & 19377 & 0.27 & 19734 & 0.20 \\ 
\textbf{32} & 19476 & 0.42 & 19347 & 0.18 & 19524 & 0.14 \\ 
\textbf{64} & 19383 & 0.27 & 19364 & 0.13 & 19440 & 0.09 \\ 
\textbf{128} & 19370 & 0.17 & 19354 & 0.09 & 19390 & 0.07 \\ 
\textbf{256} & 19364 & 0.11 & 19348 & 0.06 & 19375 & 0.05 \\ 
\textbf{512} & 19350 & 0.07 & 19350 & 0.04 & 19360 & 0.03 \\ 

