In [49]:
from blackbox import BlackBox
import sys
import binascii
import random
import math
import time

In [106]:
# Hyperparameters to create hash functions
n_hash = 36
n_groups = 4
n_rows = int(n_hash/n_groups)
m = 300

In [107]:
# Generate values for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(n_hash)]

In [108]:
def myhashs(user):

    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')),16)

    # Generate hash values
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    
    return result

In [109]:
def count_trailing_zeroes(mystr):
    return len(str(mystr)) - len(str(mystr).rstrip('0'))

In [110]:
start_time = time.time()

# Read user inputs
input_filename = 'publicdata/users.txt'
stream_size = 30
num_of_asks = 1
output_filename = 'python2b.csv'

In [111]:
# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,Ground Truth,Estimation")

In [113]:
# Blackbox
BB = BlackBox()

# Iterate over the asks
for ask_iteration in range(num_of_asks):
    stream_users = BB.ask(input_filename, stream_size)

    # Set to store all users seen
    seen_users_truth = set()

    # Lists to store the hash indexes and binary representations generated
    hash_idx = []
    hash_bin = []

    # Go over all users for this stream
    for user in stream_users:

        # Add the user to the set of seen users
        seen_users_truth.add(user)

        # Hash the user into values
        hashed_idxs = myhashs(user)

        # Store all has values for the current iteration (aka current user)
        iter_hash_bin = []

        # For the current user, get the hashed index and its binary representation
        for curr_idx in hashed_idxs:
            user_bin = bin(curr_idx)[2:]
            iter_hash_bin.append(curr_idx)

        # Add the hashed values from the curren iteration (current user) to the list of all hashes
        hash_bin.append(iter_hash_bin)
    print(hash_bin)
    # For each of the generated binary encoding of hash values, calculate the distance based on the number of trailing zeroes
    estimated_size_per_hash = []

    # Iterate through all hash functions
    for curr_hash in range(n_hash):
        curr_hash_max_zeroes = 0

        # Then, for a given hash function, go over the binary encodings generated for all users
        for curr_user in range(len(hash_bin)):

            # Count the number of trailing zeroes for the current user with the current hash
            curr_user_max_zeroes = count_trailing_zeroes(hash_bin[curr_user][curr_hash])

            # If it is longer than the previous max values for the current hash, then update the max value
            if curr_user_max_zeroes > curr_hash_max_zeroes:
                curr_hash_max_zeroes = curr_user_max_zeroes

        # Once all values have been estimarted for a given hash function, calculate the estimated size and append it to the list of estimates		
        estimated_size_per_hash.append(math.pow(2, curr_hash_max_zeroes))
    #print(estimated_size_per_hash)
    # Slices the estimate in "n_groups", then for each group calculate the group average
    group_avgs = []
    for group_idx in range(0, n_groups):
        group_sum = 0

        # Loop over the rows in the group
        for curr_row in range(0, n_rows):

            # Get the row index to be fetched from "estimated_size_per_hash" which has all estimates
            row_idx = group_idx*n_rows + curr_row

            # Fetch the estimate for the current row and add it to the sum of estimates for the current group
            group_sum += estimated_size_per_hash[row_idx]

        # Calcualte the average for the current group and append it to the list of all group averages
        group_avg = round(group_sum/n_rows)
        group_avgs.append(group_avg)
    
    # Get the median  value from the group averages by sorting them and taking the middle number
    group_avgs = sorted(group_avgs)
    distinct_users_prediction = group_avgs[int(n_groups/2)]

    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n"+str(ask_iteration)+","+str(len(seen_users_truth))+","+str(distinct_users_prediction))

# Measure the total time taken and report it
time_elapsed = time.time() - start_time
print('Duration: {}'.format(time_elapsed))

[[42, 130, 173, 128, 105, 130, 65, 88, 112, 55, 51, 269, 288, 44, 231, 50, 247, 0, 86, 238, 112, 58, 153, 246, 12, 246, 141, 73, 284, 263, 258, 231, 15, 34, 12, 32], [254, 10, 37, 156, 289, 206, 5, 72, 164, 15, 267, 165, 12, 68, 119, 118, 75, 20, 134, 38, 296, 190, 141, 282, 272, 74, 65, 145, 180, 23, 150, 247, 15, 146, 144, 56], [22, 130, 33, 148, 65, 270, 65, 248, 192, 155, 291, 109, 48, 104, 151, 270, 167, 100, 206, 138, 72, 238, 273, 186, 112, 166, 1, 253, 124, 263, 138, 71, 15, 114, 192, 92], [98, 70, 205, 192, 97, 218, 185, 180, 188, 135, 159, 117, 0, 56, 275, 34, 111, 260, 110, 38, 104, 274, 297, 114, 92, 110, 53, 109, 132, 143, 54, 139, 15, 290, 228, 44], [34, 10, 297, 76, 149, 246, 5, 32, 144, 215, 207, 205, 72, 128, 139, 138, 95, 220, 254, 138, 156, 70, 261, 222, 172, 94, 25, 25, 220, 23, 30, 287, 15, 126, 24, 116], [242, 130, 73, 228, 205, 230, 65, 288, 212, 255, 51, 69, 288, 44, 131, 250, 147, 200, 86, 38, 212, 58, 153, 246, 212, 146, 41, 73, 84, 263, 258, 31, 15, 134, 12, 