In [58]:
import random
import math
def trailing_zeroes(num, k):
    """Counts the number of trailing 0 bits in num."""
    if num == 0:
        return 2**5#k # Assumes 32 bit integer inputs!
    p = 0
    while (num >> p) & 1 == 0:
        p += 1
    return p

def estimate_cardinality_LL(stream, k):
    num_buckets = 2 ** k
    max_zeroes = [0] * num_buckets
    for h in stream:
        bucket = h & (num_buckets - 1) # Mask out the k least significant bits as bucket ID
        bucket_hash = h >> k
        max_zeroes[bucket] = max(max_zeroes[bucket], trailing_zeroes(bucket_hash, k))
    
    return 2 ** (float(sum(max_zeroes)) / num_buckets) * num_buckets * 0.79402

def estimate_cardinality_HLL(stream, k):
    num_buckets = 2 ** k
    max_zeroes = [0] * num_buckets
    for h in stream:
        bucket = h & (num_buckets - 1) # Mask out the k least significant bits as bucket ID
        bucket_hash = h >> k
        max_zeroes[bucket] = max(max_zeroes[bucket], trailing_zeroes(bucket_hash, k)+1)
    
    m = num_buckets
    if m == 16:
        alpha = 0.673
    elif m == 32:
        alpha = 0.697
    elif m == 64:
        alpha = 0.709
    else:
        alpha = 0.7213/(1 + 1.079/m)
            
    #Cardinality
    DV_est = alpha * m**2 * 1./sum([2**-i for i in max_zeroes])
        
     #Corrections:
    if DV_est < 5/2 * m: # small range correction
     
        V = sum([x == 0 for x in max_zeroes])#count_of_zero_registers( registers ) # the number of registers equal to zero
        if V == 0:  # if none of the registers are empty, use the HLL estimate
             DV = DV_est
        else:
             DV = m * math.log10(m/V)  # i.e. balls and bins correction
 
    if DV_est <= ( 1/30 * 2**32 ):  # intermediate range, no correction
          DV = DV_est
    if DV_est > ( 1/30 * 2**32 ):  # large range correction
         DV = -2**32 * math.log10( 1 - DV_est/2**32)
    
    return DV_est

In [59]:

random.seed(1943)
#Input data
N = 1000000
stream1 = [random.randint(0,2**32) for i in range (N)]

In [65]:
ks = [4,5,6,7,8,9,10,11,12,13,14,15,16]
ms = [2**k for k in ks]
Ns = [10**3,10**4,10**5,10**6,10**7]
streams = [[random.randint(0,2**32) for i in range (N)] for N in Ns]

In [66]:
card = [len(list(set(stream))) for stream in streams]
errors = []
for j in range(len(Ns)):
    err_ks = [1] * len(ks)

    for i in range(len(ks)):
        card_est = estimate_cardinality_LL(streams[j],ks[i])
    
        err_ks[i] = abs(card_est - card[j]) / card[j]
    errors.append(err_ks)

In [64]:
errors

[[0.7733323773764973,
  0.009725378501209775,
  0.02255463638213098,
  0.08336956521981119,
  0.11612160139562025,
  0.19914809453520366,
  0.495231862284078,
  1.2382954280929526,
  2.858513085831723,
  6.08613093612803,
  12.536426068453103,
  25.599428035691965,
  51.59074449268188],
 [0.14237385963897414,
  0.0705000568945141,
  0.25933304649216116,
  0.08803304393852558,
  0.0705000568945141,
  0.071950284616092,
  0.025114366470196364,
  0.009000147771387674,
  0.05771495645541072,
  0.30433915804072603,
  0.9034374191432231,
  2.172671611842997,
  4.763214593743325],
 [0.295287466800689,
  0.1056838851984033,
  0.09864994016047858,
  0.07113278523345434,
  0.06355703886428003,
  0.03263112336249192,
  0.03231841061723899,
  0.019816724315718747,
  0.016887099417038552,
  0.002289134290067741,
  0.0037653736248648784,
  0.02832999896798152,
  0.20298751521831204],
 [0.03406839883551537,
  0.2566488917240908,
  0.020439424124070717,
  0.1017389913578766,
  0.03406839883551537,
  0