In [1]:
import itertools
import math
import random
import time
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.isomorphism as iso
import sympy
from IPython.display import clear_output

In [2]:
def graphlet_list(N):
    """Group the connected graphs of the NetworkX graph atlas by node count.

    Parameters
    ----------
    N : int
        Largest graphlet size wanted; must satisfy ``1 <= N <= 7`` because the
        atlas stops at seven nodes.

    Returns
    -------
    dict
        ``{n: [G, ...]}`` for every ``n`` in ``1..N``; each list holds the
        connected atlas graphs on ``n`` nodes, in atlas order.  The position
        of a graph in its list is the "type index" used by the rest of the
        notebook.

    Raises
    ------
    ValueError
        If ``N`` exceeds the largest graph size stored in the atlas.
    """
    assert N > 0
    max_atlas_nodes = 7
    if N > max_atlas_nodes:
        raise ValueError(
            "the graph atlas only contains graphs with up to {} nodes, got N={}"
            .format(max_atlas_nodes, N))
    loc_graphlet_list = {n: [] for n in range(1, N+1)}
    # Index 0 of the atlas is the empty graph, so start at 1.  Iterating the
    # finite list (instead of counting indices forever) lets N == 7 finish
    # cleanly when the atlas is exhausted.
    for G in nx.graph_atlas_g()[1:]:
        n = G.number_of_nodes()
        # The atlas is sorted by node count, so the first graph that is too
        # large ends the scan.
        if n > N:
            break
        if nx.is_connected(G):
            loc_graphlet_list[n].append(G)
    return loc_graphlet_list
    
def find_type_match(T):
    """Classify a small connected graph and assign its nodes to slots.

    Parameters
    ----------
    T : graph
        Object exposing ``nodes()``, ``degree(node)``, ``neighbors(node)``,
        ``number_of_nodes()`` and ``number_of_edges()``.

    Returns
    -------
    tuple
        ``(type_index, mapping)`` where ``mapping`` sends every node of ``T``
        to a slot ``0..n-1``.  Graphs with up to four nodes are recognised by
        edge count and maximum degree and slotted by the hard-coded rules
        below; anything else is matched against ``cached_graphlet_list[n]``
        and ``mapping`` is the isomorphism found by the matcher.
        NOTE(review): the hard-coded slots presumably mirror the node labels
        of the templates in ``cached_graphlet_list`` -- confirm.

    Raises
    ------
    ValueError
        If the generic matching step finds no isomorphic template.
    """
    n = T.number_of_nodes()
    if n == 1:
        return (0, {u: 0 for u in T.nodes()})
    if n == 2:
        return (0, {u: i for i, u in enumerate(T.nodes())})
    if n == 3:
        if T.number_of_edges() == 2:
            # Wedge: the degree-2 centre takes slot 0.
            u0 = next((node for node in T.nodes() if T.degree(node) == 2))
            (u1, u2) = (node for node in T.neighbors(u0))
            return (0, {u0: 0, u1: 1, u2: 2})
        if T.number_of_edges() == 3:
            # Triangle: fully symmetric, any order works.
            return (1, {u: i for i, u in enumerate(T.nodes())})
    if n == 4:
        e_num = T.number_of_edges()
        max_degree = max((T.degree(node) for node in T.nodes()))
        if e_num == 3 and max_degree == 3:
            # Star: the centre takes slot 3.
            u3 = next((node for node in T.nodes() if T.degree(node) == 3))
            (u0, u1, u2) = (node for node in T.neighbors(u3))
            return (0, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 3 and max_degree == 2:
            # Path u3 - u0 - u1 - u2: the two inner nodes take slots 0 and 1.
            (u0, u1) = (node for node in T.nodes() if T.degree(node) == 2)
            u2 = next((node for node in T.neighbors(u1) if node != u0))
            u3 = next((node for node in T.neighbors(u0) if node != u1))
            return (1, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 4 and max_degree == 3:
            # Triangle with a pendant node: pendant -> 0, degree-3 node -> 3.
            u3 = next((node for node in T.nodes() if T.degree(node) == 3))
            (u1, u2) = (node for node in T.nodes() if T.degree(node) == 2)
            u0 = next((node for node in T.nodes() if T.degree(node) == 1))
            return (2, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 4 and max_degree == 2:
            # 4-cycle: walk around it starting from an arbitrary node.
            u0 = next((node for node in T.nodes()))
            (u1, u3) = (node for node in T.neighbors(u0))
            u2 = next((node for node in T.neighbors(u1) if node != u0))
            return (3, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 5:
            # Diamond: the two degree-3 nodes take the even slots.
            (u0, u2) = (node for node in T.nodes() if T.degree(node) == 3)
            (u1, u3) = (node for node in T.nodes() if T.degree(node) == 2)
            return (4, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 6:
            (u0, u1, u2, u3) = (node for node in T.nodes())
            return (5, {u0: 0, u1: 1, u2: 2, u3: 3})
    # Generic fallback (n > 4, or a small graph not covered above).
    # Improve matching procedure here for n>4.
    # Each matcher is built once and is_isomorphic() is called as a normal
    # statement: the call is what fills in `mapping`, so it must not live
    # inside an `assert` that `python -O` would strip.
    for index, template in enumerate(cached_graphlet_list[n]):
        matcher = iso.GraphMatcher(T, template)
        if matcher.is_isomorphic():
            return (index, matcher.mapping)
    raise ValueError(
        "no cached graphlet on {} nodes is isomorphic to the given graph"
        .format(n))

def find_type_prob(G,T_vert):
    """Return the type of the graphlet induced by `T_vert` and its probability.

    The induced subgraph is classified with ``find_type_match``; the degrees
    in ``G`` of its nodes, ordered by the slot the match assigns to each node,
    are fed to ``cached_prob[type]`` and the value is divided by
    ``cached_edge_number``.

    Returns
    -------
    tuple
        ``(T_type, T_prob)``.
    """
    k = len(T_vert)
    induced = subgraph(G, T_vert)
    T_type, node_to_slot = find_type_match(induced)
    # Invert the match so that slot i tells us which node's degree goes in
    # the i-th argument of the probability function.
    slot_to_node = dict((slot, node) for node, slot in node_to_slot.items())
    degree_list = []
    for slot in range(k):
        degree_list.append(G.degree(slot_to_node[slot]))
    unscaled = cached_prob[T_type](*degree_list)
    T_prob = unscaled*(cached_edge_number)**(-1)
    return (T_type, T_prob)

def find_type(T, v_num, e_num):
    """Return the type index of graphlet `T` without computing a node mapping.

    Parameters
    ----------
    T : graph
        Only inspected (``nodes()``/``degree()``) when ``v_num == 4`` and in
        the generic isomorphism fallback.
    v_num : int
        Number of nodes of ``T``, supplied by the caller.
    e_num : int
        Number of edges of ``T``, supplied by the caller.

    Returns
    -------
    int
        Index of the matching graphlet type.

    Raises
    ------
    ValueError
        If the generic fallback finds no isomorphic template in
        ``cached_graphlet_list[v_num]``.
    """
    if v_num == 1:
        return 0
    if v_num == 2:
        return 0
    if v_num == 3:
        if e_num == 2:
            return 0
        if e_num == 3:
            return 1
    if v_num == 4:
        # Edge count plus maximum degree separates all six 4-node types.
        max_degree = max((T.degree(node) for node in T.nodes()))
        if e_num == 3 and max_degree == 3:
            return 0
        if e_num == 3 and max_degree == 2:
            return 1
        if e_num == 4 and max_degree == 3:
            return 2
        if e_num == 4 and max_degree == 2:
            return 3
        if e_num == 5:
            return 4
        if e_num == 6:
            return 5
    # Improve matching procedure here for n=5
    # The templates are looked up by `v_num`; the previous code used an
    # undefined name `n` here and failed with NameError.
    for index, template in enumerate(cached_graphlet_list[v_num]):
        if iso.GraphMatcher(T, template).is_isomorphic():
            return index
    raise ValueError(
        "no cached graphlet on {} nodes is isomorphic to the given graph"
        .format(v_num))

def prob_functions(N):
    """Build one symbolic expression per graphlet type on `N` nodes.

    The expressions are built recursively in the symbols ``x_0 .. x_{N-1}``:
    the expression of a graphlet ``T`` sums, over every node ``u`` whose
    removal leaves a connected graph ``S``, the expression already computed
    for ``S`` times ``deg_T(u)`` divided by
    ``sum(x_i for i in S) - 2*|E(S)|``.
    NOTE(review): ``find_type_prob`` evaluates these expressions with node
    degrees of the host graph, so ``x_i`` presumably stands for the degree of
    the node in slot ``i`` -- confirm.

    Reads the module-level ``cached_graphlet_list``, which must already
    contain every size from 3 up to ``N``.

    Returns
    -------
    dict
        ``{type_index: sympy expression}`` for graphlets on ``N`` nodes.
    """
    assert N > 0
    x = {0: sympy.var('x_0')}
    y = {0: sympy.var('y_0')}
    prob = {1: {0: x[0]/2}}
    if N > 1:
        x[1] = sympy.var('x_1')
        y[1] = sympy.var('y_1')
        prob[2] = {0: sympy.Integer(1)}
    for size in range(3, N+1):
        smaller = size - 1
        x[smaller] = sympy.var('x_{}'.format(smaller))
        y[smaller] = sympy.var('y_{}'.format(smaller))
        # Two-step renaming: first park every x_i in a placeholder y_i, then
        # map placeholders onto the x of the actual node, so the second
        # substitution can never hit a symbol the first one just produced.
        park = {x[i]: y[i] for i in range(smaller)}
        by_type = {}
        for T_ind, T in enumerate(cached_graphlet_list[size]):
            accumulated = 0
            for u in T.nodes():
                S = subgraph(T, T.nodes()-{u})
                if nx.is_connected(S):
                    S_ind, S_match = find_type_match(S)
                    unpark = {y[slot]: x[node]
                              for node, slot in S_match.items()}
                    S_prob = prob[smaller][S_ind].subs(park).subs(unpark)
                    S_deg = (sum(x[i] for i in S.nodes())
                             - 2*S.number_of_edges())
                    accumulated += S_prob * T.degree(u) / S_deg
            by_type[T_ind] = accumulated
        prob[size] = by_type
    return prob[N]

def subgraph(G, nodes):
    """Return a new ``nx.Graph`` holding the subgraph of `G` induced by `nodes`.

    Parameters
    ----------
    G : graph
        Host graph; only ``G.neighbors(node)`` is used.
    nodes : iterable
        Nodes to keep.

    Returns
    -------
    nx.Graph
        Independent graph (not a view) on ``nodes`` with every edge of ``G``
        whose two endpoints are both in ``nodes``.
    """
    list_nodes = list(nodes)
    T = nx.Graph()
    T.add_nodes_from(list_nodes)
    for i, node in enumerate(list_nodes):
        # G.neighbors() can be a one-shot iterator; testing membership on it
        # repeatedly consumes it and silently misses edges.  Materialise it
        # once into a set so every lookup sees the full neighbourhood.
        neighbors = set(G.neighbors(node))
        # Start at i (not i+1) so a self-loop on `node` is kept, as before.
        for other in list_nodes[i:]:
            if other in neighbors:
                T.add_edge(node, other)
    return T


In [3]:
def lift(G, vert, k):
    """Grow a random connected set of `k` vertices of `G` starting at `vert`.

    A frontier list holds every outside vertex once per edge joining it to
    the set; each round draws one entry uniformly with ``random.choice`` and
    moves that vertex into the set.

    Returns
    -------
    set
        The sampled vertex set (just ``{vert}`` when ``k == 1``).
    """
    members = {vert}
    if k == 1:
        return members
    frontier = []
    latest = vert
    for _round in range(k - 1):
        # Drop every copy of the vertex just absorbed, then append the
        # neighbours it contributes that are still outside the set.
        survivors = [w for w in frontier if w != latest]
        newcomers = [w for w in G.neighbors(latest) if w not in members]
        frontier = survivors + newcomers
        latest = random.choice(frontier)
        members.add(latest)
    return members

def vertex_prob(vert_degree):
    """Return ``vert_degree / (2 * cached_edge_number)``.

    Reads the module-level ``cached_edge_number``.
    """
    degree_total = 2*cached_edge_number
    return vert_degree*degree_total**(-1)

def lift_shotgun(G, vert, k):
    """Lift `vert` to a set of ``k-1`` vertices and return its frontier.

    Parameters
    ----------
    G : graph
        Host graph; only ``G.neighbors`` is used.
    vert : node
        Starting vertex.
    k : int
        Target graphlet size of the caller; the set built here has ``k-1``
        vertices (a single vertex for ``k <= 2``).

    Returns
    -------
    tuple
        ``(graphlet, e_num, prob, neigh_list)``:

        * ``graphlet`` -- set of sampled vertices;
        * ``e_num`` -- number of edges of ``G`` counted inside ``graphlet``;
        * ``prob`` -- running product that starts at
          ``vertex_prob(deg(vert))`` and is multiplied, for every added
          vertex, by its number of edges into the set divided by the
          frontier length at that moment;
        * ``neigh_list`` -- outside vertices, one copy per edge joining them
          to ``graphlet``.

        The order is the same for every ``k``; the former ``k == 1`` shortcut
        returned ``prob`` and ``e_num`` swapped, which broke callers that
        unpack ``(S, S_enum, S_prob, neigh_list)``.
    """
    neigh_list = list(G.neighbors(vert))
    prob = vertex_prob(len(neigh_list))
    graphlet = {vert}
    e_num = 0
    # range(k - 2) is empty for k <= 2, so no special case is needed.
    for _ in range(k - 2):
        u = random.choice(neigh_list)
        u_neigh = list(G.neighbors(u))
        S_degree = sum(1 for v in u_neigh if v in graphlet)
        # len(neigh_list) is taken before the frontier is updated: it is the
        # size of the list `u` was actually drawn from.
        prob = prob * S_degree * (len(neigh_list))**(-1)
        neigh_list = ([v for v in neigh_list if v != u]
                      + [v for v in u_neigh if v not in graphlet])
        graphlet.add(u)
        e_num += S_degree
    return (graphlet, e_num, prob, neigh_list)

def random_walk_nodes(G, v0, steps_num):
    """Run a simple random walk of `steps_num` steps from `v0`.

    Each step moves to a neighbour drawn uniformly with ``random.choice``.

    Returns
    -------
    node
        The vertex reached (``v0`` itself when ``steps_num`` is 0).
    """
    position = v0
    for _step in range(steps_num):
        options = list(G.neighbors(position))
        position = random.choice(options)
    return position

# Graphs known to load_graph(): edge-list path, whether read_edgelist is asked
# to parse a float 'weight' column, whether self-loops are stripped after
# loading, and the reference graphlet counts available per graphlet size k.
_GRAPH_CATALOG = {
    'bio-celegansneural': {
        'path': 'Graphs/bio-celegansneural.mtx',
        'weighted': True,
        'strip_self_loops': False,
        'ground_truth': {3: {0: 44081,
                             1: 3241}},
    },
    'ia-email-univ': {
        'path': 'Graphs/ia-email-univ.mtx',
        'weighted': False,
        'strip_self_loops': False,
        'ground_truth': {3: {0: 80386,
                             1: 5343}},
    },
    'misc-polblogs': {
        'path': 'Graphs/misc-polblogs.mtx',
        'weighted': True,
        'strip_self_loops': False,
        'ground_truth': {3: {0: 1038396,
                             1: 101096}},
    },
    'misc-as-caida': {
        'path': 'Graphs/misc-as-caida.mtx',
        'weighted': True,
        'strip_self_loops': False,
        'ground_truth': {3: {0: 14797175,
                             1: 36365}},
    },
    'misc-fullb': {
        'path': 'Graphs/misc-fullb.mtx',
        'weighted': False,
        'strip_self_loops': True,
        'ground_truth': {3: {0: 162067420,
                             1: 60212260},
                         4: {0: 1078734774,
                             1: 4837795036,
                             2: 2707584768,
                             3: 64898820,
                             4: 897215295,
                             5: 370980150}},
    },
}


def load_graph(name, k=3):
    """Load one of the benchmark graphs together with its reference counts.

    Parameters
    ----------
    name : str
        Key of ``_GRAPH_CATALOG`` (e.g. ``'misc-polblogs'``).
    k : int, optional
        Graphlet size whose reference counts are wanted.

    Returns
    -------
    dict
        ``{'graph': G, 'ground_truth': counts}`` where ``G`` is an undirected
        ``nx.Graph`` read with ``nx.read_edgelist`` and ``counts`` maps type
        index to the recorded count, or is ``None`` when no counts are
        recorded for this graph and ``k``.

    Raises
    ------
    KeyError
        If ``name`` is not a known graph.
    """
    if name not in _GRAPH_CATALOG:
        raise KeyError("unknown graph name: {!r}".format(name))
    spec = _GRAPH_CATALOG[name]

    read_kwargs = {'create_using': nx.Graph()}
    if spec['weighted']:
        read_kwargs['data'] = (('weight', float),)
    G = nx.read_edgelist(spec['path'], **read_kwargs)

    if spec['strip_self_loops']:
        # Only remove loops that exist: remove_edge() raises for a node that
        # has no self-loop.  The node list is copied before mutating G.
        for v in list(G.nodes()):
            if G.has_edge(v, v):
                G.remove_edge(v, v)

    ground_truth = spec['ground_truth'].get(k)
    if ground_truth is not None:
        # Hand out a copy so callers cannot alter the catalogue.
        ground_truth = dict(ground_truth)

    return {'graph': G, 'ground_truth': ground_truth}

def lift_mixing_variance(G, k, steps_num=1000, burn_in_limit=20):
    """Estimate per-type expectation, variance and lagged correlation.

    At every step a graphlet is sampled with ``lift`` from the current
    vertex, the walk advances by one step, and the sample contributes
    ``1/prob`` to the counter of its type (0 to all other types).

    Parameters
    ----------
    G : graph
        Host graph.
    k : int
        Graphlet size; ``cached_graphlet_list[k]`` fixes the number of types.
    steps_num : int, optional
        Number of samples drawn.
    burn_in_limit : int, optional
        Number of lags (1 .. ``burn_in_limit``) for which the correlation
        between samples is estimated.

    Returns
    -------
    tuple
        ``(correlation, variance)``.  ``variance[i]`` is the empirical
        variance of type ``i``'s estimator.  ``correlation[i]`` is a list
        whose entry ``ind`` is the autocorrelation at lag ``ind + 1``; types
        with zero variance have no entry, and a lag with no sample pairs
        yields ``nan``.

    The same figures are also printed.
    """
    v = random_walk_nodes(G, random.choice(list(G.nodes())), 100)
    graphlet_num = len(cached_graphlet_list[k])
    exp_counter = {i: 0 for i in range(graphlet_num)}
    var_counter = {i: 0 for i in range(graphlet_num)}
    variance = {i: 0 for i in range(graphlet_num)}
    expectation = {i: 0 for i in range(graphlet_num)}
    corr_counter = {i: {lag_ind: 0 for lag_ind in range(burn_in_limit)}
                    for i in range(graphlet_num)}
    # memory[ind] is the sample drawn ind+1 steps ago (None until it exists).
    memory = [None for _ in range(burn_in_limit)]
    for _ in range(steps_num):
        T = lift(G, v, k)
        v = random_walk_nodes(G, v, 1)
        T_type, T_prob = find_type_prob(G, T)
        exp_counter[T_type] += (T_prob)**(-1)
        var_counter[T_type] += (T_prob)**(-2)
        for ind, earlier in enumerate(memory):
            if earlier is None:
                break
            S_type, S_prob = earlier
            # Only same-type pairs have a non-zero product for that type.
            if T_type == S_type:
                corr_counter[T_type][ind] += (T_prob*S_prob)**(-1)
        memory = ([(T_type, T_prob)] + memory)[:burn_in_limit]

    for i in range(graphlet_num):
        expectation[i] = exp_counter[i]*steps_num**(-1)
        variance[i] = (var_counter[i]*steps_num**(-1)
                       - expectation[i]**2)

    correlation = {}
    for i in range(graphlet_num):
        if variance[i] == 0:
            continue
        row = []
        for ind in range(burn_in_limit):
            # Lag ind+1 pairs step t with step t-(ind+1), so only
            # steps_num-(ind+1) pairs were accumulated for it.
            pair_num = steps_num - (ind + 1)
            if pair_num > 0:
                covariance = (corr_counter[i][ind]*pair_num**(-1)
                              - expectation[i]**2)
                row.append(covariance*(variance[i])**(-1))
            else:
                row.append(float('nan'))
        correlation[i] = row

    print("Expectation")
    for i in range(graphlet_num):
        print(expectation[i])

    print("Normalized Variance")
    for i in range(graphlet_num):
        if expectation[i] != 0:
            print(variance[i]*expectation[i]**(-2))
        else:
            print("No graphlets found")

    for i in range(graphlet_num):
        print("Correlation for Graphlet ID{}".format(i+1))
        if expectation[i] == 0:
            print("No graphlets found")
        elif i not in correlation:
            # Found, but every sample had the same weight: the correlation
            # is undefined and there is no row to print.
            print("Zero variance, correlation undefined")
        else:
            for ind, val in enumerate(correlation[i]):
                print("({0}, {1:.5f})".format(ind+1, val))
    return (correlation, variance)

def lift_count_shotgun(G, k, steps_num, burn_in, ground_truth):
    """Track the relative error of the shotgun graphlet-count estimator.

    Each step lifts the current vertex to ``k-1`` vertices with
    ``lift_shotgun``, then completes the set with every distinct frontier
    vertex; each completion adds ``1/S_prob`` to the counter of the type it
    forms.  The walk then advances ``burn_in`` steps.

    Parameters
    ----------
    G : graph
        Host graph.
    k : int
        Graphlet size; 2, 3 and 4 are supported.
    steps_num : int
        Number of sampling steps.
    burn_in : int
        Random-walk steps taken between two consecutive samples.
    ground_truth : dict
        Reference count per type index; read every 100 steps.

    Returns
    -------
    dict
        ``{type_index: [error, ...]}`` with one entry per 100 completed
        steps, each equal to
        ``abs(counter / (step * co[type] * ground_truth[type]) - 1)``.

    Raises
    ------
    ValueError
        If ``k`` has no normalisation constants.
    """
    # Per-type normalisation constants, by graphlet size.
    # NOTE(review): presumably the number of times the sampler can produce
    # one fixed graphlet of that type -- confirm against the derivation.
    co_by_size = {
        2: {0: 2},
        3: {0: 4, 1: 6},
        4: {0: 12, 1: 8, 2: 12, 3: 16, 4: 20, 5: 24},
    }
    if k not in co_by_size:
        raise ValueError(
            "lift_count_shotgun supports k in {}, got {!r}"
            .format(sorted(co_by_size), k))
    co = co_by_size[k]

    v = random_walk_nodes(G, random.choice(list(G.nodes())), 100)
    graphlet_num = len(cached_graphlet_list[k])
    exp_counter = {i: 0 for i in range(graphlet_num)}
    errors = {i: [] for i in range(graphlet_num)}
    for step in range(1, steps_num+1):
        S, S_enum, S_prob, neigh_list = lift_shotgun(G, v, k)
        # A frontier vertex appears once per edge into S, so its multiplicity
        # is the number of edges it adds; count them all in a single pass.
        multiplicity = Counter(neigh_list)
        for u, edges_into_S in multiplicity.items():
            T = G.subgraph(S.union({u}))
            T_enum = S_enum + edges_into_S
            T_type = find_type(T, v_num=k, e_num=T_enum)
            exp_counter[T_type] += (S_prob)**(-1)
        v = random_walk_nodes(G, v, burn_in)
        if step % 100 == 0:
            for i in range(graphlet_num):
                errors[i].append(
                    abs(exp_counter[i]*(step*co[i]*ground_truth[i])**(-1) - 1)
                )
    return errors

def brute_force(G, k=3):
    """Count 3-node graphlets of `G` exactly by enumerating around each edge.

    Parameters
    ----------
    G : graph
        Host graph (``edges``, ``neighbors``, ``has_edge``,
        ``number_of_edges`` are used).
    k : int, optional
        Graphlet size.  Only ``k == 3`` is implemented; for any other value
        the all-zero counter is returned.

    Returns
    -------
    dict
        ``{0: wedges, 1: triangles}`` as integers for ``k == 3``.

    Progress is printed (and the cell output cleared) roughly once per
    percent of edges processed.
    """
    type_counter = {i: 0 for i in range(len(cached_graphlet_list[k]))}
    if k == 3:
        # Progress is measured against the graph actually being counted.
        edge_total = G.number_of_edges()
        percent_count = 0
        for counter, (u, v) in enumerate(G.edges(), 1):
            # Extend the edge by a third vertex hanging off either endpoint.
            for near, far in ((u, v), (v, u)):
                for w in set(G.neighbors(near)) - {far}:
                    if G.has_edge(w, far):
                        type_counter[1] += 1
                    else:
                        type_counter[0] += 1
            if counter > percent_count*0.01*edge_total:
                clear_output()
                print("{}% complete".format(percent_count))
                percent_count += 1
        # Every wedge is reached from each of its 2 edges, every triangle
        # twice from each of its 3 edges; the totals divide exactly, so
        # floor division keeps the counts integral.
        type_counter[0] = type_counter[0] // 2
        type_counter[1] = type_counter[1] // 6
    return type_counter

In [4]:
# Graphlet size for this session, and the per-size caches the helpers above
# read as module-level globals.
k = 3
x = list(map(lambda index: sympy.var('x_{}'.format(index)), range(k)))
cached_graphlet_list = graphlet_list(k)
# Turn every symbolic probability into a plain function of (x_0 .. x_{k-1}).
cached_prob = dict((ind, sympy.lambdify(x, func))
                   for ind, func in prob_functions(k).items())
# Timing noted by the author: 4s for N=5, 2m43s for N=6

In [6]:
# Load the graph under study and cache its size; `cached_edge_number` is read
# as a global by the sampling helpers.
load_gr = load_graph('misc-polblogs', k)
G, ground_truth = load_gr['graph'], load_gr['ground_truth']
cached_edge_number = G.number_of_edges()
cached_vert_number = G.number_of_nodes()
print(cached_vert_number, cached_edge_number)

(1224, 16718)


In [7]:
# Average the relative error of lift_count_shotgun over independent epochs.
epoch_num = 100
steps_limit = 1000
graphlet_num = len(cached_graphlet_list[k])
# lift_count_shotgun records one error every 100 steps, so the checkpoints
# are 100, 200, ..., steps_limit.
step_range = list(range(100, steps_limit+1, 100))
error_sum = {i: [0]*(len(step_range)) for i in range(graphlet_num)}
for epoch in range(epoch_num):
    # Use the session's k (not a literal 3) so the sampler agrees with
    # graphlet_num and with the ground truth loaded for that k.
    errors = lift_count_shotgun(G, k,
                                steps_num=steps_limit,
                                burn_in=3,
                                ground_truth=ground_truth)
    for i in range(graphlet_num):
        for (num, val) in enumerate(errors[i]):
            error_sum[i][num] += val
for i in range(graphlet_num):
    print("Errors for Graphlet ID{}".format(i+1))
    for (num, step) in enumerate(step_range):
        print("({0:}, {1:.5f})".format(step, error_sum[i][num]*(epoch_num)**(-1)))
        

Errors for Graphlet ID1
(100, 0.05534)
(200, 0.04402)
(300, 0.03885)
(400, 0.03540)
(500, 0.03173)
(600, 0.02859)
(700, 0.02693)
(800, 0.02598)
(900, 0.02590)
(1000, 0.02572)
Errors for Graphlet ID2
(100, 0.10181)
(200, 0.06865)
(300, 0.05528)
(400, 0.04821)
(500, 0.04536)
(600, 0.04341)
(700, 0.03955)
(800, 0.03754)
(900, 0.03675)
(1000, 0.03574)


In [19]:
# NOTE(review): the saved output below lists six graphlet types, so this run
# presumably used k=4 with G loaded from 'misc-fullb', not the k=3 /
# 'misc-polblogs' state created by the setup cells above -- confirm, and
# rebuild G, cached_edge_number, cached_graphlet_list and cached_prob before
# re-running.
lift_mixing_variance(G, k, steps_num=10**7) #misc-fullb

Expectation
1076580410.99
4842167983.19
2710131800.24
64643653.7485
896641833.555
370912652.283
Normalized Variance
21.6007145733
6.01186406081
2.57636581724
128.798825901
3.21553512349
3.80707221099
Correlation for Graphlet ID1
(1, 0.00921)
(2, 0.00812)
(3, 0.00598)
(4, 0.00549)
(5, 0.00403)
(6, 0.00433)
(7, 0.00393)
(8, 0.00343)
(9, 0.00341)
(10, 0.00296)
(11, 0.00270)
(12, 0.00322)
(13, 0.00293)
(14, 0.00273)
(15, 0.00248)
(16, 0.00245)
(17, 0.00218)
(18, 0.00180)
(19, 0.00257)
(20, 0.00186)
Correlation for Graphlet ID2
(1, 0.00786)
(2, 0.00662)
(3, 0.00613)
(4, 0.00531)
(5, 0.00437)
(6, 0.00403)
(7, 0.00441)
(8, 0.00377)
(9, 0.00310)
(10, 0.00381)
(11, 0.00310)
(12, 0.00295)
(13, 0.00269)
(14, 0.00242)
(15, 0.00227)
(16, 0.00216)
(17, 0.00203)
(18, 0.00208)
(19, 0.00250)
(20, 0.00200)
Correlation for Graphlet ID3
(1, 0.01361)
(2, 0.01035)
(3, 0.00842)
(4, 0.00739)
(5, 0.00680)
(6, 0.00592)
(7, 0.00545)
(8, 0.00513)
(9, 0.00468)
(10, 0.00460)
(11, 0.00392)
(12, 0.00358)
(13, 0.00380

({0: [0.00921007543259128,
   0.008122700710081929,
   0.005976423596121746,
   0.005487561949758606,
   0.004034040176952304,
   0.0043298838602481285,
   0.0039254438250433,
   0.0034291967519360844,
   0.0034077233731348,
   0.002958347641165646,
   0.0027011577633102722,
   0.003215504823720034,
   0.002931052932653457,
   0.0027278859353612787,
   0.0024802697854064645,
   0.002447541549261612,
   0.0021813331515426384,
   0.0017999196782160604,
   0.0025734903458888947,
   0.0018643495886506612],
  1: [0.00786125413935401,
   0.006622426383090961,
   0.006134456403674279,
   0.005312445283445107,
   0.004368588322218333,
   0.004027444186552542,
   0.00441068351338473,
   0.003767365217777373,
   0.0030988015222914588,
   0.003813728747369672,
   0.0030959722840299235,
   0.0029480693712015675,
   0.0026927398795320826,
   0.0024247773901893885,
   0.002266883806120828,
   0.0021626678752130024,
   0.002028570432853392,
   0.0020819278857534184,
   0.00250190131912754,
   0.00200

In [16]:
# NOTE(review): the saved output below lists six graphlet types, so this run
# presumably used k=4 with G loaded from 'bio-celegansneural', not the k=3 /
# 'misc-polblogs' state created by the setup cells above -- confirm, and
# rebuild G, cached_edge_number, cached_graphlet_list and cached_prob before
# re-running.
lift_mixing_variance(G, k, steps_num=10**7) #bio-celegansneural

Expectation
651267.939651
516574.474623
186157.705
15849.2162466
22721.295104
2016.39711593
Normalized Variance
6.88849614353
5.46429330379
5.55277876024
48.2519764125
16.2958101515
104.149168047
Correlation for Graphlet ID1
(1, 0.08235)
(2, 0.05926)
(3, 0.02935)
(4, 0.02344)
(5, 0.01616)
(6, 0.01266)
(7, 0.00933)
(8, 0.00788)
(9, 0.00579)
(10, 0.00433)
(11, 0.00373)
(12, 0.00296)
(13, 0.00256)
(14, 0.00165)
(15, 0.00152)
(16, 0.00134)
(17, 0.00087)
(18, 0.00094)
(19, -0.00000)
(20, 0.00117)
Correlation for Graphlet ID2
(1, 0.00493)
(2, 0.00397)
(3, 0.00122)
(4, 0.00131)
(5, 0.00094)
(6, 0.00129)
(7, 0.00054)
(8, 0.00006)
(9, 0.00009)
(10, 0.00008)
(11, 0.00047)
(12, 0.00034)
(13, -0.00007)
(14, 0.00019)
(15, -0.00006)
(16, -0.00024)
(17, -0.00039)
(18, 0.00023)
(19, -0.00043)
(20, 0.00051)
Correlation for Graphlet ID3
(1, 0.01692)
(2, 0.01482)
(3, 0.00964)
(4, 0.00746)
(5, 0.00591)
(6, 0.00496)
(7, 0.00328)
(8, 0.00310)
(9, 0.00241)
(10, 0.00222)
(11, 0.00179)
(12, 0.00136)
(13, 0.001

({0: [0.08234813291615234,
   0.05925634494426099,
   0.029353114886480687,
   0.02343896149230539,
   0.01615629538585082,
   0.012655054147911066,
   0.00932849658250875,
   0.007875702368576798,
   0.005792672210027533,
   0.004330783715410737,
   0.0037330933015588953,
   0.002956371782413846,
   0.0025574958777358566,
   0.0016543132274004313,
   0.0015217365215481368,
   0.0013409961435803834,
   0.0008665455464245941,
   0.0009398158834417421,
   -4.237039749019333e-06,
   0.0011688698546584422],
  1: [0.004931653080711473,
   0.003968454569160797,
   0.0012234124029257337,
   0.0013104808565717202,
   0.000936733529483267,
   0.0012873820271488519,
   0.0005366707915127475,
   5.658621506789981e-05,
   8.969695895966348e-05,
   8.280250497460508e-05,
   0.0004737974984991424,
   0.00034258070775810054,
   -6.776215830571883e-05,
   0.0001889951536969325,
   -6.494427300608427e-05,
   -0.00024217747555538592,
   -0.0003858891898662252,
   0.0002302580368994939,
   -0.00042900231

In [14]:
# NOTE(review): the saved output below lists six graphlet types, so this run
# presumably used k=4, while the setup cell above sets k=3 -- confirm, and
# rebuild cached_graphlet_list and cached_prob for that k before re-running.
lift_mixing_variance(G, k, steps_num=10**7) #misc-polblogs

Expectation
39756389.7782
31442276.9592
15779382.1191
1130700.74517
2780642.81068
422690.570943
Normalized Variance
5.02870233217
6.22418351059
4.63318046191
41.1343213903
10.5407458861
40.1182859767
Correlation for Graphlet ID1
(1, 0.04017)
(2, 0.02090)
(3, 0.00616)
(4, 0.00389)
(5, 0.00117)
(6, 0.00132)
(7, 0.00020)
(8, 0.00063)
(9, 0.00015)
(10, 0.00047)
(11, 0.00022)
(12, 0.00075)
(13, -0.00000)
(14, -0.00007)
(15, 0.00061)
(16, -0.00048)
(17, -0.00050)
(18, 0.00003)
(19, -0.00031)
(20, 0.00013)
Correlation for Graphlet ID2
(1, 0.00089)
(2, 0.00169)
(3, 0.00141)
(4, 0.00123)
(5, 0.00055)
(6, 0.00022)
(7, 0.00015)
(8, 0.00057)
(9, -0.00000)
(10, 0.00033)
(11, 0.00014)
(12, 0.00074)
(13, 0.00040)
(14, 0.00015)
(15, 0.00026)
(16, 0.00021)
(17, -0.00041)
(18, -0.00018)
(19, 0.00001)
(20, -0.00007)
Correlation for Graphlet ID3
(1, 0.00590)
(2, 0.00749)
(3, 0.00373)
(4, 0.00368)
(5, 0.00255)
(6, 0.00183)
(7, 0.00189)
(8, 0.00147)
(9, 0.00109)
(10, 0.00142)
(11, 0.00077)
(12, 0.00079)
(13

({0: [0.040173260760518685,
   0.020903444224615583,
   0.006162978625567292,
   0.003892837518180663,
   0.001169645324656492,
   0.0013229024406416284,
   0.00019706532359435624,
   0.0006332918379221131,
   0.00015440278919547955,
   0.000467319955834885,
   0.00022485385237622408,
   0.0007453915048957225,
   -3.2351193444287906e-06,
   -6.554180002002733e-05,
   0.0006141526029783677,
   -0.00048270868486609644,
   -0.0005048159771579885,
   3.3012406891887935e-05,
   -0.0003133293531734743,
   0.000133777244808009],
  1: [0.0008882762138852043,
   0.0016860517732104488,
   0.0014066039883016716,
   0.0012308806563460302,
   0.0005531461330078343,
   0.00022494037426285687,
   0.0001535166311335701,
   0.0005683332151580991,
   -2.0871989919065518e-06,
   0.0003301422015468811,
   0.0001412695410482696,
   0.0007374317131166893,
   0.0003957087080575912,
   0.00014950113579378422,
   0.0002648113240372706,
   0.000205102467656348,
   -0.00040681157239406024,
   -0.0001828632906606

In [9]:
# NOTE(review): per the trailing comment this run used k=4 on 'ia-email-univ',
# whereas the setup cells above create the k=3 / 'misc-polblogs' state --
# rebuild G, cached_edge_number, cached_graphlet_list and cached_prob before
# re-running.
lift_mixing_variance(G, k, steps_num=10**6) #ia-email-univ k=4

Expectation
547887.417791
1101571.55201
216065.782393
12532.9920062
20675.4953373
3429.89054644
Normalized Variance
5.22968446811
3.29408438591
5.84446401148
92.5274026548
26.2681132984
70.0719729201
Correlation for Graphlet ID1
(1, 0.03750)
(2, 0.02414)
(3, 0.01440)
(4, 0.01052)
(5, 0.00713)
(6, 0.00616)
(7, 0.00328)
(8, 0.00691)
(9, 0.00369)
(10, 0.00345)
(11, 0.00406)
(12, 0.00186)
(13, 0.00105)
(14, 0.00045)
(15, 0.00223)
(16, -0.00074)
(17, 0.00124)
(18, -0.00055)
(19, -0.00072)
(20, 0.00031)
Correlation for Graphlet ID2
(1, 0.02680)
(2, 0.01919)
(3, 0.01386)
(4, 0.00947)
(5, 0.00642)
(6, 0.00579)
(7, 0.00465)
(8, 0.00272)
(9, 0.00211)
(10, 0.00086)
(11, 0.00136)
(12, 0.00099)
(13, 0.00073)
(14, 0.00211)
(15, 0.00128)
(16, 0.00075)
(17, 0.00045)
(18, -0.00070)
(19, 0.00055)
(20, -0.00164)
Correlation for Graphlet ID3
(1, 0.03579)
(2, 0.02577)
(3, 0.01759)
(4, 0.01348)
(5, 0.01110)
(6, 0.01096)
(7, 0.00835)
(8, 0.00679)
(9, 0.00625)
(10, 0.00525)
(11, 0.00459)
(12, 0.00479)
(13, -0

({0: [0.0375031694590886,
   0.024140585225645963,
   0.014396453168411245,
   0.010520785703859233,
   0.007131950839222019,
   0.006158382138379803,
   0.003278081380848234,
   0.006909319893064218,
   0.0036945811947603916,
   0.00344821461187178,
   0.004057039469516134,
   0.001857903101790716,
   0.001046756432183966,
   0.0004507182686988406,
   0.0022333536203319538,
   -0.0007420130254880523,
   0.0012442692406654703,
   -0.000546796796539446,
   -0.0007152265652923473,
   0.0003085238695906495],
  1: [0.026796813741888907,
   0.01919267377918568,
   0.013861020675229471,
   0.009468752012506656,
   0.006415154326594276,
   0.005791206571328875,
   0.004646083540355328,
   0.002716222556053679,
   0.0021102660845275494,
   0.0008599182330094624,
   0.0013616689169089475,
   0.0009935144786463266,
   0.0007282359277643766,
   0.0021104633851277994,
   0.0012762777969067213,
   0.0007538759555382139,
   0.0004502767259581605,
   -0.0007001353620322863,
   0.0005538865996413364,


In [12]:
# NOTE(review): per the trailing comment this run used k=4 on 'misc-as-caida',
# whereas the setup cells above create the k=3 / 'misc-polblogs' state --
# rebuild G, cached_edge_number, cached_graphlet_list and cached_prob before
# re-running.
lift_mixing_variance(G, k, steps_num=10**7) #misc-as-caida k=4

Expectation
7808296525.8
284848429.278
47028370.2105
400210.938854
1709408.20162
51632.6409433
Normalized Variance
4.06966279249
109.189409635
249.479073602
3823.38648073
2455.15344766
18488.0947963
Correlation for Graphlet ID1
(1, 0.52452)
(2, 0.42350)
(3, 0.32309)
(4, 0.27210)
(5, 0.21990)
(6, 0.19150)
(7, 0.16182)
(8, 0.14510)
(9, 0.12612)
(10, 0.11446)
(11, 0.10168)
(12, 0.09316)
(13, 0.08391)
(14, 0.07818)
(15, 0.07118)
(16, 0.06656)
(17, 0.06158)
(18, 0.05772)
(19, 0.05397)
(20, 0.05089)
Correlation for Graphlet ID2
(1, 0.00188)
(2, 0.00172)
(3, 0.00085)
(4, 0.00097)
(5, 0.00137)
(6, 0.00147)
(7, 0.00066)
(8, 0.00091)
(9, 0.00088)
(10, 0.00082)
(11, 0.00011)
(12, 0.00013)
(13, 0.00080)
(14, 0.00092)
(15, 0.00114)
(16, 0.00060)
(17, 0.00041)
(18, 0.00031)
(19, 0.00069)
(20, 0.00040)
Correlation for Graphlet ID3
(1, 0.00124)
(2, 0.00152)
(3, 0.00053)
(4, 0.00102)
(5, 0.00053)
(6, 0.00081)
(7, 0.00078)
(8, 0.00089)
(9, 0.00032)
(10, 0.00077)
(11, 0.00098)
(12, 0.00065)
(13, 0.00077)

({0: [0.5245202548336614,
   0.42350383949641396,
   0.3230873085529438,
   0.2720971128149509,
   0.21990484221174383,
   0.19150488248473868,
   0.16182219625285782,
   0.1450988269818642,
   0.12612262845538375,
   0.11446038578360224,
   0.10167501094650715,
   0.09316207740821582,
   0.08390856994704539,
   0.07818460012756444,
   0.0711801476994985,
   0.06656459458286795,
   0.06157722723496233,
   0.05772204944142978,
   0.0539728811687127,
   0.05089015231780983],
  1: [0.0018760981315526016,
   0.0017242535834908368,
   0.0008535130124386242,
   0.0009713196611911217,
   0.0013739280426938803,
   0.0014703803772853862,
   0.0006564238067155528,
   0.000905688619095265,
   0.0008838707251059612,
   0.0008202864318101606,
   0.00010641022591746012,
   0.00012757998544907985,
   0.0008004311074680423,
   0.0009218041494835241,
   0.0011378089269539124,
   0.0006033802451705601,
   0.0004069674257361414,
   0.0003096187290703441,
   0.0006926204451089961,
   0.0003999425690583699

In [None]:
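# Garbage code: scratch helpers kept for reference. Running this cell redefines
# brute_force, replacing the definition given earlier in the notebook.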
def liftSRW(G, k, time_limit=None, query_limit=None, epoch_num=1, time_step=10, 
            output_form='count', ground_truth=None, steps_mult=None, steps_num=None):
    """Estimate graphlet counts by lifting vertices visited by a random walk.

    Parameters
    ----------
    G : graph
        Host graph.
    k : int
        Graphlet size.
    time_limit, query_limit : number, optional
        Stopping rule per epoch; exactly one must be given.  ``time_limit``
        is in seconds; ``query_limit`` bounds a counter that grows by
        ``steps_between_lifts + k - 1`` per sample.
    epoch_num : int, optional
        Number of independent runs (at least 1).
    time_step : number, optional
        Seconds between two progress reports.
    output_form : {'count', 'ratio'}, optional
        What the progress report shows.
    ground_truth : dict, optional
        Reference counts; when given, the error returned by ``NMSE`` is
        reported and averaged over epochs.
    steps_mult, steps_num : number, optional
        Walk length between samples; exactly one must be given.
        ``steps_mult`` is multiplied by ``log(number of nodes)``.

    Returns
    -------
    dict
        ``{'count': scaled counters of the last epoch,
        'NMSE': error averaged over epochs (0 without ground truth)}``.

    Raises
    ------
    ValueError
        If ``epoch_num`` is smaller than 1.
    """
    assert (time_limit is None) != (query_limit is None)
    assert (steps_mult is None) != (steps_num is None)
    if epoch_num < 1:
        # Without at least one epoch there is nothing to report or return.
        raise ValueError("epoch_num must be at least 1, got {!r}".format(epoch_num))

    if steps_mult is not None:
        steps_between_lifts = int(steps_mult*math.log(G.number_of_nodes()))
    else:
        steps_between_lifts = steps_num
    norm_error = 0

    for epoch in range(epoch_num):
        type_counter = {i: 0 for i in range(len(cached_graphlet_list[k]))}
        t0 = time.time()
        lift_count = 0
        query_count = 0
        time_iter_count = 1
        stop_condition = False
        v = random.choice(list(G.nodes()))

        while not stop_condition:
            v = random_walk_nodes(G, v, steps_between_lifts)
            T = lift(G, v, k)
            T_type, T_match = find_type_match(subgraph(G, T))
            inv_match = {i: j for j, i in T_match.items()}
            degree_list = [G.degree(inv_match[i]) for i in range(k)]
            # Unlike find_type_prob, this is not divided by the edge count;
            # the factor is applied once when the counters are scaled.
            T_prob = cached_prob[T_type](*degree_list)
            type_counter[T_type] += (T_prob)**(-1)
            lift_count += 1
            curr_time = time.time()

            if curr_time - t0 > time_iter_count*time_step:
                if output_form == 'count':
                    print("Time is {} Type counter is {}"
                          .format(int(curr_time-t0),
                                  scale(type_counter,
                                        G.number_of_edges()*lift_count**(-1))))
                if output_form == 'ratio':
                    print("Time is {} Type ratios are {}"
                          .format(int(curr_time-t0),
                                  normalize(type_counter)))
                # The error can only be reported when there is a reference.
                if ground_truth is not None:
                    print("Time is {} NMSE is {}"
                          .format(int(curr_time-t0),
                                  NMSE(type_counter, ground_truth)))
                print("Number of graphlets sampled is {}".format(lift_count))
                time_iter_count += 1

            if time_limit is not None:
                stop_condition = (time.time()-t0 > time_limit)
            if query_limit is not None:
                query_count += steps_between_lifts + k - 1
                stop_condition = (query_count > query_limit)

        if ground_truth is not None:
            error = NMSE(type_counter, ground_truth)
            norm_error += error
    print("Number of lifts is {}".format(lift_count))
    norm_error = norm_error*(epoch_num)**(-1)
    return {'count': scale(type_counter, G.number_of_edges()*lift_count**(-1)),
            'NMSE': norm_error}

def brute_force(G, N=3):
    """Count 3- or 4-node graphlets of `G` exactly by enumerating around edges.

    This definition shares its name with the 3-node-only ``brute_force``
    defined earlier in the notebook and replaces it once this cell runs.

    Parameters
    ----------
    G : graph
        Host graph.
    N : int, optional
        Graphlet size, 3 or 4; any other value returns the all-zero counter.

    Returns
    -------
    dict
        ``{type_index: count}`` with integer counts.
    """
    type_counter = {i: 0 for i in range(len(cached_graphlet_list[N]))}
    if N == 3:
        edge_total = G.number_of_edges()
        percent_count = 0
        for counter, (u, v) in enumerate(G.edges(), 1):
            # Extend the edge by a third vertex hanging off either endpoint.
            for near, far in ((u, v), (v, u)):
                for w in set(G.neighbors(near)) - {far}:
                    if G.has_edge(w, far):
                        type_counter[1] += 1
                    else:
                        type_counter[0] += 1
            if counter > percent_count*0.00001*edge_total:
                clear_output()
                #print("{}% complete".format(percent_count*0.001))
                percent_count += 1
        # Each wedge is seen 2 times and each triangle 6 times; the totals
        # divide exactly, so floor division keeps the counts integral.
        type_counter[0] = type_counter[0] // 2
        type_counter[1] = type_counter[1] // 6

    if N == 4:
        for u, v in G.edges():
            neigh = (set(G.neighbors(u)) | set(G.neighbors(v))) - {u, v}
            for w, z in itertools.combinations(neigh, 2):
                T = subgraph(G, {u, v, w, z})
                # find_type needs the node and edge counts explicitly.
                type_counter[find_type(T, 4, T.number_of_edges())] += 1
        # A graphlet is reached once per edge whose endpoints neighbour both
        # remaining nodes: 3 for types 0 and 2, 1 for type 1, then 4, 5, 6.
        for type_index, times_seen in ((0, 3), (2, 3), (3, 4), (4, 5), (5, 6)):
            type_counter[type_index] = type_counter[type_index] // times_seen
    return type_counter

def NMSE(dict_hat, dict_true):
    """Sum of squared relative errors between two normalised distributions.

    Both dictionaries are passed through ``normalize``; for every key whose
    reference share is non-zero, ``(estimated/reference - 1)**2`` is added.
    """
    estimated = normalize(dict_hat)
    reference = normalize(dict_true)
    total = 0
    for i, freq in reference.items():
        if freq == 0:
            # A relative error against a zero reference is undefined.
            continue
        relative_gap = estimated[i]*freq**(-1) - 1
        total += relative_gap**2
    return total

def normalize(dict_hat):
    """Return a copy of `dict_hat` with every value divided by the total."""
    total_count = sum(dict_hat.values())
    shares = {}
    for key, val in dict_hat.items():
        shares[key] = val*(total_count)**(-1)
    return shares

def scale(dict_hat, scalar):
    """Multiply every value of `dict_hat` by `scalar` and truncate to int."""
    scaled = {}
    for key, val in dict_hat.items():
        scaled[key] = int(val*scalar)
    return scaled

def lift_variance(G, k, steps_num=1000, burn_in=3):
    """Sample graphlets along a random walk and summarise their weights.

    Each of the `steps_num` samples is lifted from the current vertex and
    contributes ``1/prob`` (and its square) to the counters of its type; the
    walk then advances `burn_in` steps.

    Returns
    -------
    dict
        ``'expectation'``: per type, the sum of ``1/prob`` divided by
        ``steps_num``.  ``'variance'``: per type, the square root of the sum
        of ``1/prob**2`` divided by ``steps_num`` (a root-mean-square, the
        mean is not subtracted).
    """
    v = random_walk_nodes(G,random.choice(list(G.nodes())),100)
    graphlet_num = len(cached_graphlet_list[k])
    sum_of_squares = dict((i, 0) for i in range(graphlet_num))
    sum_of_weights = dict((i, 0) for i in range(graphlet_num))
    for _ in range(steps_num):
        T = lift(G, v, k)
        T_type, T_prob = find_type_prob(G,T)
        sum_of_squares[T_type] += (T_prob)**(-2)
        sum_of_weights[T_type] += (T_prob)**(-1)
        v = random_walk_nodes(G,v,burn_in)
    norm_variance = {}
    norm_expectation = {}
    for i in range(graphlet_num):
        norm_variance[i] = sum_of_squares[i]**(0.5) * steps_num**(-0.5)
        norm_expectation[i] = sum_of_weights[i] * steps_num**(-1)
    return {'variance': norm_variance,
            'expectation': norm_expectation}
