In [1]:
import random

The improved version is based on the base version of TRIEST. The changes are:
1. UpdateCounters is called unconditionally for each element on the stream, before the algorithm decides whether or not to insert the edge into S.
2. TRIEST-impr never decrements the counters when an edge is removed from S.
3. UpdateCounters performs a weighted increase of the counters using η(t) = max{1,(t − 1)(t − 2)/(M(M − 1))} as weight.

In [2]:
import time
class TRIEST_BASE:
    """Streaming triangle counter implementing the TRIEST-IMPR variant.

    The class keeps its original name, but the logic differs from the base
    TRIEST algorithm in three ways:

    1. ``update_counters`` is called for every edge of the stream, before the
       reservoir decides whether to keep the edge.
    2. Counters are never decremented when an edge is evicted from ``S``.
    3. Counter increments are weighted by
       ``max(1, (t - 1)(t - 2) / (M (M - 1)))``.

    Attributes:
        M: maximum number of edges kept in the reservoir sample.
        S: the reservoir sample, a set of edges; each edge is indexable with
            its two endpoints at positions 0 and 1.
        global_counter: running estimate of the global triangle count.
        local_counter: dict mapping a node to its running local estimate.
        t: number of stream elements processed so far.
    """

    def __init__(self, M = 100):
        self.M = M
        self.S = set()
        self.global_counter = 0
        self.local_counter = {}
        self.t = 0

    def sample_edge(self, t):
        """Reservoir-sampling decision for the edge arriving at time ``t``.

        Returns True when the caller should insert the new edge into ``S``.
        While ``t <= M`` every edge is kept. Afterwards the edge is kept with
        probability ``M / t``, in which case one uniformly chosen edge is
        evicted from ``S`` first. Counters are deliberately left untouched on
        eviction.
        """
        if t <= self.M:
            return True
        if not self.S:
            # Nothing to evict (only reachable with a non-positive M).
            return False
        if random.random() <= (self.M / t):
            # random.sample() no longer accepts a set (TypeError since
            # Python 3.11), so draw from a tuple snapshot of the reservoir.
            evicted_edge = random.choice(tuple(self.S))
            self.S.remove(evicted_edge)
            return True
        return False

    def update_counters(self, t, edge):
        """Add the weighted triangles closed by ``edge`` to the counters.

        Every common neighbour ``c`` of the endpoints ``u`` and ``v`` within
        the current sample ``S`` closes one triangle ``(u, v, c)``. For each
        such triangle the global counter and the local counters of ``u``,
        ``v`` and ``c`` are increased by
        ``max(1, (t - 1)(t - 2) / (M (M - 1)))``.

        Args:
            t: 1-based position of ``edge`` in the stream.
            edge: indexable pair whose first two items are the endpoints.
        """
        u = edge[0]
        v = edge[1]
        neighborhood_of_u = set()
        neighborhood_of_v = set()
        # Single pass over the sample collects both neighbourhoods.
        for one_edge in self.S:
            first, second = one_edge[0], one_edge[1]
            if first == u:
                neighborhood_of_u.add(second)
            if second == u:
                neighborhood_of_u.add(first)
            if first == v:
                neighborhood_of_v.add(second)
            if second == v:
                neighborhood_of_v.add(first)

        shared_neighborhood = neighborhood_of_u & neighborhood_of_v
        if not shared_neighborhood:
            # No triangle is closed; returning here also avoids evaluating
            # the weight, whose denominator M * (M - 1) is zero when M < 2.
            return

        weight = max(1, ((t - 1) * (t - 2)) / (self.M * (self.M - 1)))
        for c in shared_neighborhood:
            self.global_counter += weight
            self.local_counter[c] = self.local_counter.get(c, 0) + weight
            self.local_counter[u] = self.local_counter.get(u, 0) + weight
            self.local_counter[v] = self.local_counter.get(v, 0) + weight

    def run_triest_base(self, streams):
        """Process every edge in ``streams`` and return the global estimate.

        Args:
            streams: iterable of edges, each indexable with its endpoints at
                positions 0 and 1.

        Returns:
            The value of ``global_counter`` after the whole stream has been
            consumed. No final rescaling is applied because the increments
            are already weighted.
        """
        for element in streams:
            self.t += 1
            # Counters are updated unconditionally, before the sampling step.
            self.update_counters(self.t, element)
            if self.sample_edge(self.t):
                self.S.add(element)
        return self.global_counter



In [3]:
def undirected_edge(u,v):
    """Return the endpoints as a tuple in canonical order.

    The smaller endpoint (according to ``<``) comes first, so both
    orientations of an undirected edge map to the same tuple. When neither
    endpoint is strictly smaller than the other, ``(v, u)`` is returned.
    """
    return (u, v) if u < v else (v, u)

# Import the dataset: every non-comment line of the edge list holds two
# whitespace-separated node ids. Edges are stored in canonical orientation in
# a set, so duplicates and reversed copies collapse; self-loops are dropped.
streams = set()

with open("CA-HepTh.txt") as f:
    for line in f:
        # Lines starting with '#' are header comments.
        if line.startswith('#'):
            continue
        edge = line.split()
        # Blank or truncated lines carry no edge; skipping them avoids an
        # IndexError on edge[0] / edge[1].
        if len(edge) < 2:
            continue
        if edge[0] != edge[1]:
            streams.add(undirected_edge(edge[0], edge[1]))

print('The amount of edges of Data Stream contains', len(streams))


The amount of edges of Data Stream contains 25973


In [4]:
# set M >=6 & M < length of the dataset
# Estimate the global triangle count
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
t1 = time.perf_counter()
triest_base = TRIEST_BASE(3000)
print('the value of M is', triest_base.M)
glo_tri_counter = triest_base.run_triest_base(streams)
t1 = time.perf_counter() - t1
print('Running time for estimation is:', t1, 'seconds')
print('Estimation for the global triangle count is', glo_tri_counter)

# Get the true amount of global triangles: with M equal to the stream length
# no edge is ever evicted and every weight is 1, so the count is exact.
t2 = time.perf_counter()
triest_base = TRIEST_BASE(len(streams))
print('the value of M is', len(streams))
true_glo_tri_counter = triest_base.run_triest_base(streams)
t2 = time.perf_counter() - t2
print('Running time for actual global triangles amount is:', t2, 'seconds')
print('Actual global triangle count is', true_glo_tri_counter)

# Compare the estimation & Actual count
error_count = abs(true_glo_tri_counter - glo_tri_counter)
print('Error triangles are', error_count)
if true_glo_tri_counter:
    error_rate = error_count / true_glo_tri_counter
else:
    # Triangle-free graph: the relative error is undefined, so report 0 for a
    # perfect estimate and infinity otherwise instead of dividing by zero.
    error_rate = 0.0 if error_count == 0 else float('inf')
print('Error rate is', error_rate)

the value of M is 3000
Running time for estimation is: 23.425249099731445 seconds
Estimation for the global triangle count is 28766.527782594203
the value of M is 25973
Running time for actual global triangles amount is: 48.8598690032959 seconds
Actual global triangle count is 28339
Error triangles are 427.527782594203
Error rate is 0.01508619861654268
