In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess
import re


In [None]:
# Experiment driver: run the external ./Cardinality binary once per
# (parameter, seed) pair on a single input stream and collect the estimate
# printed by each algorithm.
#
# Dataset mode (alternative, disabled): sweep input files at a fixed parameter
# instead of sweeping parameters on one file, i.e.
#   paths = [file_name for file_name in os.listdir("datasets") if file_name.endswith('.txt')]
#   param = 64
#   results = {path: {"REC": [], "HLL": [], "PCSA": []} for path in paths}
#   for path in paths: ...
path = "streams/random_stream-n50000-N100000-a4.txt"
params = [16, 32, 64, 128, 256, 512]
num_executions = 30000  # seeds 1..num_executions are used, one run per seed

# results[param][key] is the list of estimates gathered over all seeds.
results = {param: {key: [] for key in ("REC", "HLL", "PCSA")} for param in params}

# Leading text of an output line -> key under which its estimate is stored.
prefix_to_key = {
    "Recordinality": "REC",
    "HyperLogLog": "HLL",
    "Probabilistic Counting": "PCSA",
}

for param in params:
    estimates = results[param]
    for seed in range(1, num_executions + 1):
        command = ["./Cardinality", "-f", path, "-p", str(param), "--seed", str(seed)]
        p = subprocess.run(command, capture_output=True, text=True)

        # A failed run is reported and contributes no estimates.
        if p.returncode != 0:
            print(f"Error with seed {seed}, path {path} and param {param}: {p.stderr}")
            continue

        for line in p.stdout.splitlines():
            for prefix, key in prefix_to_key.items():
                if line.startswith(prefix):
                    # The estimate is the text following the first ": ".
                    estimates[key].append(float(line.split(": ")[1]))
                    break


# LaTeX table comparing real vs. estimated cardinality

In [None]:
# Build one LaTeX table row per dataset: dataset name, reference cardinality,
# and for every algorithm its mean estimate and accuracy in percent.
#
# This cell needs `results` keyed by dataset file name (the dataset-mode
# variant of the experiment cell). The rows are derived from `results` itself
# rather than from a separately defined `paths` list, so the cell no longer
# raises NameError when that list was never created; parameter sweeps (integer
# keys) are skipped.
dataset_results = {
    key: by_algo for key, by_algo in results.items() if isinstance(key, str)
}
if not dataset_results:
    print("No per-dataset results found; run the experiment over dataset files first.")

# Mean estimate per (dataset, algorithm).
averages = {
    key: {algo: np.mean(scores) for algo, scores in by_algo.items()}
    for key, by_algo in dataset_results.items()
}

latex_output = ""
for path, scores in averages.items():
    stem = path.split('.')[0]

    # The line count of the companion .dat file is the reference value
    # (presumably one distinct element per line -- TODO confirm).
    # Lines are counted while streaming so the file is never held in memory.
    with open("datasets/" + stem + ".dat", 'r') as file:
        line_count = sum(1 for _line in file)

    row = f"\\textbf{{{stem}}} & \\textbf{{{line_count}}}"
    for algo, avg in scores.items():
        accuracy = 100 - (abs(avg - line_count) / line_count)*100
        # "\\%" yields the LaTeX-escaped percent sign; a bare "\%" is an
        # invalid Python escape sequence and triggers a SyntaxWarning.
        row += f" & {avg:.0f} & {accuracy:.1f}\\%"
    latex_output += row + " \\\\ \n"

print(latex_output)

# LaTeX table comparing parameters on datasets

In [None]:
# Build one LaTeX table row per parameter value: for every algorithm, the mean
# estimate followed by the population standard deviation of its estimates
# divided by the line count of the dataset's .dat file.

# Mean estimate per (parameter, algorithm).
averages = {
    param: {algo: np.mean(scores) for algo, scores in results[param].items()}
    for param in params
}

latex_output = ""

# Only the number of lines is needed, so the file is closed before formatting.
# (Presumably the .dat file lists one distinct element per line -- TODO confirm.)
with open("datasets/" + path.split('.')[0] + ".dat", 'r') as file:
    lines = list(file)
line_count = len(lines)

for param, scores in averages.items():
    row = f"\\textbf{{{param}}}"
    for algo, avg in scores.items():
        # ddof=0: population standard deviation over all collected runs.
        error = np.std(results[param][algo], ddof=0) / line_count
        row += f" & {avg:.0f} & {error:.2f}"
    latex_output = latex_output + row + " \\\\ \n"

print(latex_output)

# LaTeX table comparing parameters on random streams


In [None]:
# Build one LaTeX table row per parameter value for a random stream: for every
# algorithm, the mean estimate followed by the population standard deviation
# of its estimates divided by n, where n is read from the stream's file name.

# Mean estimate per (parameter, algorithm).
averages = {
    param: {algo: np.mean(scores) for algo, scores in results[param].items()}
    for param in params
}

latex_output = ""

# The stream file name encodes three integers as "n<..>-N<..>-a<..>.txt".
# Only n is used below (presumably the true number of distinct elements --
# TODO confirm); N and alpha are extracted but not used in this cell.
pattern = r'n(\d+)-N(\d+)-a(\d+)\.txt'
match = re.search(pattern, path)

# When the path does not follow the naming scheme no rows are produced and an
# empty string is printed.
if match is not None:
    n, N, alpha = map(int, match.groups())

    for param, scores in averages.items():
        row = f"\\textbf{{{param}}}"
        for algo, avg in scores.items():
            # ddof=0: population standard deviation over all collected runs.
            error = np.std(results[param][algo], ddof=0) / n
            row += f" & {avg:.0f} & {error:.2f}"
        latex_output = latex_output + row + " \\\\ \n"

print(latex_output)