In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.isomorphism as iso
import sympy
import numpy as np
import random
import time
import itertools
import math
from IPython.display import clear_output

In [2]:
def graphlet_list(N):
    assert N > 0
    foo = 1
    loc_graphlet_list = {n: [] for n in range(1,N+1)}
    while True:
        G = nx.graph_atlas(foo)
        n = G.number_of_nodes()
        if n>N:
            break
        if nx.is_connected(G):
            loc_graphlet_list[n].append(G)
        foo += 1
    return loc_graphlet_list
    

def find_type_match(T):
    n = T.number_of_nodes()
    if n==1:
        return((0, {u: 0 for u in T.nodes()}))
    if n==2:
        return((0, {u: i for i,u in enumerate(T.nodes())}))
    if n==3:
        if T.number_of_edges()==2:
            u0 = next((node for node in T.nodes() if T.degree(node)==2))
            (u1,u2) = (node for node in T.neighbors(u0))
            return((0, {u0: 0, u1: 1, u2: 2}))
        if T.number_of_edges()==3:
            return((1,{u:i for i,u in enumerate(T.nodes())}))
    if n==4:
        e_num = T.number_of_edges()
        max_degree = max((T.degree(node) for node in T.nodes()))
        if e_num==3 and max_degree==3:
            u3 = next((node for node in T.nodes() if T.degree(node)==3))
            (u0,u1,u2) = (node for node in T.neighbors(u3))
            return((0, {u0:0, u1:1, u2:2, u3:3}))
        if e_num==3 and max_degree==2:
            (u0,u1) = (node for node in T.nodes() if T.degree(node)==2)
            u2 = next((node for node in T.neighbors(u1) if node!=u0))
            u3 = next((node for node in T.neighbors(u0) if node!=u1))
            return((1, {u0:0, u1:1, u2:2, u3:3}))
        if e_num==4 and max_degree==3:
            u3 = next((node for node in T.nodes() if T.degree(node)==3))
            (u1,u2) = (node for node in T.nodes() if T.degree(node)==2)
            u0 = next((node for node in T.nodes() if T.degree(node)==1))
            return((2, {u0:0, u1:1, u2:2, u3:3}))
        if e_num==4 and max_degree==2:
            u0 = next((node for node in T.nodes()))
            (u1,u3) = (node for node in T.neighbors(u0))
            u2 = next((node for node in T.neighbors(u1) if node!=u0))
            return((3, {u0:0, u1:1, u2:2, u3:3}))
        if e_num==5:
            (u0,u2) = (node for node in T.nodes() if T.degree(node)==3)
            (u1,u3) = (node for node in T.nodes() if T.degree(node)==2)
            return((4, {u0:0, u1:1, u2:2, u3:3}))
        if e_num==6:
            (u0,u1,u2,u3) = (node for node in T.nodes())
            return((5, {u0:0, u1:1, u2:2, u3:3}))
    # Improve matching procedure here for n>4.
    GM = next((i, iso.GraphMatcher(T,T_)) 
              for (i,T_) in enumerate(cached_graphlet_list[n]) 
              if iso.GraphMatcher(T,T_).is_isomorphic())
    assert GM[1].is_isomorphic()
    return((GM[0],GM[1].mapping))

def find_type(T):
    n = T.number_of_nodes()
    if n==1:
        return 0
    if n==2:
        return 0
    if n==3:
        if T.number_of_edges()==2:
            return 0
        if T.number_of_edges()==3:
            return 1
    if n==4:
        e_num = T.number_of_edges()
        max_degree = max((T.degree(node) for node in T.nodes()))
        if e_num==3 and max_degree==3:
            return 0
        if e_num==3 and max_degree==2:
            return 1
        if e_num==4 and max_degree==3:
            return 2
        if e_num==4 and max_degree==2:
            return 3
        if e_num==5:
            return 4
        if e_num==6:
            return 5
    # Improve matching procedure here at least for n=4.
    GM = next((i 
              for (i,T_) in enumerate(cached_graphlet_list[n]) 
              if iso.GraphMatcher(T,T_).is_isomorphic()))
    return GM  

def subgraph(G, nodes):
    list_nodes = list(nodes)
    T = nx.Graph()
    T.add_nodes_from(nodes)
    for i in range(len(nodes)):
        for j in range(i):
            if list_nodes[i] in G.neighbors(list_nodes[j]):
                T.add_edge(list_nodes[i],list_nodes[j])
    return T

def sub_edge_num(k, T_type):
    if k < 3:
        return k
    T = cached_graphlet_list[k][T_type]
    count = 0
    for u in T.nodes():
        S = subgraph(T, T.nodes()-{u})
        if not nx.is_connected(S):
            continue
        for v in S.nodes():
            if not nx.is_connected(subgraph(S, S.nodes()-{u})):
                continue
            if not nx.is_connected(subgraph(T, T.nodes()-{v})):
                continue
            count+=1
    return count

def lift(G, vert, k):
    graphlet = set([vert])
    if k==1:
        return graphlet
    u = vert
    neig_list = []
    for n in range(2, k+1):
        neig_list = ([v for v in neig_list if v!=u] 
                     + [v for v in G.neighbors(u) if v not in graphlet])
        u = random.choice(neig_list)
        graphlet.add(u)
    return graphlet

In [16]:
def SRW_step(G, graphlet):
    del_ins_list = []
    for u in graphlet:
        if not nx.is_connected(subgraph(G, graphlet-{u})):
            continue
        neigh = set()
        for v in graphlet-{u}:
            neigh.update(set(G.neighbors(v)))
        neigh = neigh - graphlet - {u}
        for w in neigh:
            del_ins_list.append((u,w))
    pair = random.choice(del_ins_list)
    new_graphlet = graphlet-{pair[0]}; new_graphlet.add(pair[1])
    return (new_graphlet, len(del_ins_list))

def load_graph(name, N=3):
    
    ground_truth = None
    G = None

    if name=='com-amazon':
        G = nx.read_edgelist(
            'Graphs/com-amazon.ungraph.txt',
            create_using = nx.Graph())
        
        if N==3:
            ground_truth = {0: 7750799, 
                            1: 667129}
        if N==4:
            ground_truth = {0: 124295537, 
                            1: 37383434, 
                            2: 13674662, 
                            3: 422515, 
                            4: 1874925, 
                            5: 275961}

    if name=='com-dblp':
        G = nx.read_edgelist(
            'Graphs/com-dblp.ungraph.txt',
            create_using = nx.Graph())
        if N==3:
            ground_truth = {0: 15107734, 
                            1: 2224385}
        if N==4:
            ground_truth = {0: 258570802, 
                            1: 252447350, 
                            2: 96615211, 
                            3: 203394, 
                            4: 4764685, 
                            5: 16713192}

    if name=='com-lj':
        G = nx.read_edgelist(
            'Graphs/com-lj.ungraph.txt',
            create_using = nx.Graph())
        if N==3:
            ground_truth = {0: 3722307805, 
                            1: 177820130}
        if N==4:
            ground_truth = {0: 1983908933796,
                            1: 542683013686,
                            2: 57662704306,
                            3: 2541452010,
                            4: 8190586835,
                            5: 521691844}

    if name=='com-youtube':
        G = nx.read_edgelist(
            'Graphs/com-youtube.ungraph.txt',
            create_using = nx.Graph())
        if N==3:
            ground_truth = {0: 1465313402, 
                            1: 3056386}
        if N==4:
            ground_truth = {0: 5730407268993,
                            1: 91488735459,
                            2: 12371157628,
                            3: 231979854,
                            4: 221833272,
                            5: 4986965}

    if name=='misc-net25':
        G = nx.read_edgelist(
            'Graphs/misc-net25.mtx',
            create_using = nx.Graph())
        for v in G.nodes():
            G.remove_edge(v,v)
        if N==3:
            ground_truth = {0: 12690840, 
                            1: 64090}
        if N==4:
            ground_truth = {0: 361490550,
                            1: 550792350,
                            2: 12554670,
                            3: 44915955,
                            4: 0,
                            5: 0}

    if name=='bio-celegansneural':
        G = nx.read_edgelist(
            'Graphs/bio-celegansneural.mtx',
            create_using = nx.Graph(), data=(('weight',float),))

    if name=='bio-yeast':
        G = nx.read_edgelist(
            'Graphs/bio-yeast.mtx',
            create_using = nx.Graph())
    
    if name=='bn-macaque-rhesus_brain_1':
        G = nx.read_edgelist(
            'Graphs/bn-macaque-rhesus_brain_1.edges',
            create_using = nx.Graph())
    
    if name=='bn-mouse-brain':
        G = nx.read_edgelist(
            'Graphs/bn-mouse_brain_1.edges',
            create_using = nx.Graph())
    
    if name=='ia-email-univ':
        G = nx.read_edgelist(
            'Graphs/ia-email-univ.mtx',
            create_using = nx.Graph())

    if name=='misc-polblogs':
        G = nx.read_edgelist(
            'Graphs/misc-polblogs.mtx',
            create_using = nx.Graph(), data=(('weight',float),))
        

    if name=='misc-as-caida':
        G = nx.read_edgelist(
            'Graphs/misc-as-caida.mtx',
            create_using = nx.Graph(), data=(('weight',float),)) 
        if N==3:
            ground_truth = {0: 59513652, 
                            1: 72730}
        if N==4:
            ground_truth = {0: 62565214368,
                            1: 2808802860,
                            2: 203097552,
                            3: 3774144,
                            4: 4084544,
                            5: 0}

    if name=='misc-fullb':
        G = nx.read_edgelist(
            'Graphs/misc-fullb.mtx',
            create_using = nx.Graph())
        for v in G.nodes():
            G.remove_edge(v,v)
        if N==3:
            ground_truth = {0: 162067420, 
                            1: 60212260}
        if N==4:
            ground_truth = {0: 1078734774,
                            1: 4837795036,
                            2: 2707584768,
                            3: 64898820,
                            4: 897215295,
                            5: 370980150}

    if name=='misc-neos3':
        G = nx.read_edgelist(
            'Graphs/misc-neos3.mtx',
            create_using = nx.Graph(), data=(('weight',float),))

        if N==3:
            ground_truth = {0: 207426691, 
                            1: 505603}
        if N==4:
            ground_truth = {0: 59618248397,
                            1: 11164704825,
                            2: 120388385,
                            3: 2047846,
                            4: 499122,
                            5: 0}

    if name=='misc-discogs_affiliation':
        G = nx.read_edgelist(
            'Graphs/misc-discogs_affiliation.edges',
            create_using = nx.Graph())
        if N==3:
            ground_truth = None
        if N==4:
            ground_truth = {0: 208345722513295,
                            1: 851118877585,
                            2: 58223406336,
                            3: 3008868833,
                            4: 439215089,
                            5: 654413}

    if name=='misc-amazon-ratings':
        G = nx.read_edgelist(
            'Graphs/misc-amazon-ratings.edges',
            create_using = nx.Graph())
        if N==3:
            ground_truth = {0: 699425719, 
                            1: 79638}
        if N==4:
            ground_truth = {0: 719668204837,
                            1: 40966346985,
                            2: 184396006,
                            3: 37045086,
                            4: 561566,
                            5: 671}

    if name=='misc-dbpedia-all':
        G = nx.read_edgelist(
            'Graphs/misc-dbpedia-all.edges',
            create_using = nx.Graph())
        if N==3:
            ground_truth = {0: 174250340949, 
                            1: 8329548}
        if N==4:
            ground_truth = {0: 19646604300441472,
                            1: 1652259549599,
                            2: 622928133900,
                            3: 15925209557,
                            4: 15630164176,
                            5: 4609834}
            
    if G is None:
        raise KeyError

    return {'graph': G, 'ground_truth': ground_truth}

def psrw_mixing_variance(G, k, steps_num=1000, burn_in_limit=20):
    v = random.choice(list(G.nodes()))
    init_graphlet = lift(G, v, k-1)
    old_graphlet = init_graphlet
    graphlet_num = len(cached_graphlet_list[k])
    exp_counter = {i:0 for i in range(graphlet_num)}
    var_counter = {i:0 for i in range(graphlet_num)}
    type_counter = {i:0 for i in range(graphlet_num)}
    pair_counter = {i: 
                    {burn_in: 0 
                     for burn_in in range(0,burn_in_limit)}
                    for i in range(graphlet_num)} 
    corr_counter = {i: 
                    {burn_in: 0 
                     for burn_in in range(0,burn_in_limit)}
                    for i in range(graphlet_num)} 
    expectation = {i:0 for i in range(graphlet_num)}
    variance = {i:0 for i in range(graphlet_num)}

    memory = [None for _ in range(burn_in_limit)]
    for _ in range(steps_num):
        new_graphlet = SRW_step(G, old_graphlet)[0]
        T = old_graphlet.union(new_graphlet)
        old_graphlet = new_graphlet
        assert len(T)==k
        T_type = find_type(subgraph(G, T))
        T_prob = cached_sub_edge_num[T_type]
        type_counter[T_type] += 1
        exp_counter[T_type] += (T_prob)**(-1)
        var_counter[T_type] += (T_prob)**(-2)
        ind = 0
        while ind < burn_in_limit and memory[ind] is not None:
            S_type, S_prob = memory[ind]
            if T_type==S_type:
                pair_counter[T_type][ind] += 1
                corr_counter[T_type][ind] += (T_prob*S_prob)**(-1)
            ind+=1
        memory = [(T_type, T_prob)] + memory[:-1]

    for i in range(graphlet_num):
        expectation[i] = exp_counter[i]*steps_num**(-1)
        variance[i] = (var_counter[i]*steps_num**(-1)
                       - expectation[i]**2)

    correlation = {i: [(corr_counter[i][burn_in]*(steps_num-burn_in)**(-1)- expectation[i]**2)
                       *(variance[i])**(-1)
                       for burn_in in range(burn_in_limit)] 
                   for i in range(graphlet_num) 
                   if variance[i]!=0}

    print("Expectation")
    for i in range(graphlet_num):    
        print(expectation[i])

    print("Normalized Variance")
    for i in range(graphlet_num):
        if expectation[i]!=0:
            print variance[i]*expectation[i]**(-2)
        else:
            print "No graphlets found"

    for i in range(graphlet_num):
        print ("Correlation for Graphlet ID{}".format(i+1))
        if expectation[i]!=0:
            for burn_in, val in enumerate(correlation[i]):
                print "({0}, {1:.5f})".format(burn_in+1, val)
        else:
            print "No graphlets found"
    return (correlation, variance)

def psrw_count(G, k, steps_num=1000, burn_in=10):
    v = random.choice(list(G.nodes()))
    init_graphlet = lift(G, v, k-1)
    old_graphlet = init_graphlet
    graphlet_num = len(cached_graphlet_list[k])
    exp_counter = {i:0 for i in range(graphlet_num)}

    for _ in range(steps_num):
        new_graphlet = SRW_step(G, old_graphlet)[0]
        T = old_graphlet.union(new_graphlet)
        old_graphlet = new_graphlet
        assert len(T)==k
        T_type = find_type(subgraph(G, T))
        T_prob = cached_sub_edge_num[T_type]
        exp_counter[T_type] += (T_prob)**(-1)

    exp_counter = {i: exp_counter[i]*(steps_num)**(-1)
                   for i in range(graphlet_num)}

    return exp_counter

In [4]:
k=4
cached_graphlet_list = graphlet_list(k)
cached_sub_edge_num = {T_type: sub_edge_num(k, T_type) 
                       for T_type in range(len(cached_graphlet_list[k])) 
                      } 


In [26]:
G = load_graph('misc-fullb',k)['graph']
print(G.number_of_nodes(), G.number_of_edges())

(199187, 5754445)


In [28]:
psrw_mixing_variance(G, k, steps_num=10**7) #misc-fullb

Expectation
0.0232174166662
0.1042483
0.0585123666677
0.000931133333333
0.0161340166667
0.00802857499999
Normalized Variance
6.17851900007
3.79624128163
1.84840070856
88.4966707241
4.16507048788
9.37959206117
Correlation for Graphlet ID1
(1, 0.18169)
(2, 0.05211)
(3, 0.03210)
(4, 0.02289)
(5, 0.01722)
(6, 0.01438)
(7, 0.01263)
(8, 0.01135)
(9, 0.01084)
(10, 0.00944)
(11, 0.00886)
(12, 0.00836)
(13, 0.00737)
(14, 0.00661)
(15, 0.00704)
(16, 0.00667)
(17, 0.00586)
(18, 0.00643)
(19, 0.00588)
(20, 0.00616)
Correlation for Graphlet ID2
(1, 0.15999)
(2, 0.01802)
(3, 0.00710)
(4, 0.00337)
(5, 0.00156)
(6, 0.00158)
(7, 0.00094)
(8, 0.00046)
(9, 0.00074)
(10, 0.00057)
(11, 0.00086)
(12, 0.00075)
(13, 0.00039)
(14, 0.00049)
(15, 0.00075)
(16, 0.00002)
(17, 0.00031)
(18, 0.00026)
(19, 0.00021)
(20, 0.00027)
Correlation for Graphlet ID3
(1, 0.04567)
(2, 0.00814)
(3, 0.00352)
(4, 0.00231)
(5, 0.00104)
(6, 0.00072)
(7, 0.00055)
(8, 0.00067)
(9, 0.00105)
(10, 0.00075)
(11, 0.00059)
(12, 0.00069)
(13

({0: [0.18168875938107726,
   0.05211112307999982,
   0.03210175860609236,
   0.02288816840562869,
   0.01722340525267209,
   0.014378521502065056,
   0.012627060356532514,
   0.01134766413790121,
   0.010843922701716108,
   0.009443590573833122,
   0.008863951402831244,
   0.008356039377522854,
   0.007373559623377508,
   0.006611265729192497,
   0.007035808007056573,
   0.00667468632943097,
   0.005856511567699027,
   0.006425342545192424,
   0.005882400303077994,
   0.006162654043727198],
  1: [0.15999118284538283,
   0.01802429082943994,
   0.007098750289734129,
   0.003373297419876968,
   0.0015639089789774862,
   0.0015815084992394741,
   0.000938604715805812,
   0.00046052366390905795,
   0.0007386886560902057,
   0.0005726802457083171,
   0.0008611467365860001,
   0.0007520991693967666,
   0.00038915153289893046,
   0.0004879504957482523,
   0.0007533903543162497,
   1.595507582300238e-05,
   0.0003129052213083249,
   0.00025657666316716165,
   0.00020691373146486887,
   0.0002

In [25]:
psrw_mixing_variance(G, k, steps_num=10**6) #bio-celegansneural

Expectation
0.101072166667
0.0802995
0.0290021666667
0.00163383333333
0.00296283333333
0.00031625
Normalized Variance
0.648986780034
5.22668883368
4.74669708582
50.0047944507
27.1262305226
262.504611331
Correlation for Graphlet ID1
(1, 0.39744)
(2, 0.17887)
(3, 0.15817)
(4, 0.14444)
(5, 0.13611)
(6, 0.12885)
(7, 0.12299)
(8, 0.11897)
(9, 0.11473)
(10, 0.11190)
(11, 0.10766)
(12, 0.10500)
(13, 0.10277)
(14, 0.10018)
(15, 0.09645)
(16, 0.09544)
(17, 0.09242)
(18, 0.08975)
(19, 0.08852)
(20, 0.08705)
Correlation for Graphlet ID2
(1, 0.26662)
(2, 0.09107)
(3, 0.08336)
(4, 0.07540)
(5, 0.07146)
(6, 0.06689)
(7, 0.06275)
(8, 0.05978)
(9, 0.05744)
(10, 0.05502)
(11, 0.05254)
(12, 0.05191)
(13, 0.04995)
(14, 0.04883)
(15, 0.04598)
(16, 0.04479)
(17, 0.04096)
(18, 0.04135)
(19, 0.03953)
(20, 0.03948)
Correlation for Graphlet ID3
(1, 0.22630)
(2, 0.04456)
(3, 0.03345)
(4, 0.02534)
(5, 0.02229)
(6, 0.01928)
(7, 0.01857)
(8, 0.01770)
(9, 0.01742)
(10, 0.01575)
(11, 0.01483)
(12, 0.01407)
(13, 0.01

({0: [0.39744088097555963,
   0.17886649461521445,
   0.15817450274225567,
   0.1444376417099657,
   0.13610568160274114,
   0.12885050172111748,
   0.1229863459471309,
   0.11896991532435063,
   0.11473141283066357,
   0.11190489409075965,
   0.10766218616277569,
   0.10499906065637352,
   0.10277167947516039,
   0.10017977213151555,
   0.09644820438994403,
   0.09543588276438658,
   0.09241658874896123,
   0.08974505217478773,
   0.08852322452718334,
   0.08704999864637201],
  1: [0.2666179912322408,
   0.0910709592356655,
   0.08336390909782992,
   0.07539721225760411,
   0.0714584974981119,
   0.06688924047148628,
   0.06275022173184185,
   0.059783249217834164,
   0.05743938897721922,
   0.055021343235685784,
   0.05253652985940886,
   0.051906235551428044,
   0.04994809944379647,
   0.048828205961519044,
   0.04597988865070645,
   0.04479322582791559,
   0.04095828706401559,
   0.041351680807016664,
   0.039534467209123035,
   0.03947535282971337],
  2: [0.22629872367360426,
   0

In [23]:
psrw_mixing_variance(G, k, steps_num=10**6) #misc-polblogs

Expectation
0.090874666667
0.0716615
0.0359195
0.00172991666667
0.00527533333333
0.000987416666667
Normalized Variance
0.834027818501
5.97724719689
3.64000519678
47.1718772581
14.7967900923
83.3953076209
Correlation for Graphlet ID1
(1, 0.39142)
(2, 0.17593)
(3, 0.14196)
(4, 0.12159)
(5, 0.10785)
(6, 0.09989)
(7, 0.09266)
(8, 0.08571)
(9, 0.08276)
(10, 0.07918)
(11, 0.07426)
(12, 0.06988)
(13, 0.06630)
(14, 0.06416)
(15, 0.06256)
(16, 0.05994)
(17, 0.05650)
(18, 0.05449)
(19, 0.05343)
(20, 0.05167)
Correlation for Graphlet ID2
(1, 0.22903)
(2, 0.07109)
(3, 0.05652)
(4, 0.04969)
(5, 0.04370)
(6, 0.03965)
(7, 0.03425)
(8, 0.03074)
(9, 0.03149)
(10, 0.02740)
(11, 0.02563)
(12, 0.02485)
(13, 0.02384)
(14, 0.02334)
(15, 0.02392)
(16, 0.02166)
(17, 0.02146)
(18, 0.01927)
(19, 0.01695)
(20, 0.01785)
Correlation for Graphlet ID3
(1, 0.17339)
(2, 0.05000)
(3, 0.03680)
(4, 0.02819)
(5, 0.02476)
(6, 0.02299)
(7, 0.01912)
(8, 0.01836)
(9, 0.01883)
(10, 0.01909)
(11, 0.01628)
(12, 0.01439)
(13, 0.0

({0: [0.39142001839743373,
   0.17593264259687133,
   0.14196374975659623,
   0.12159016915030119,
   0.10785090636902873,
   0.09988694193039964,
   0.09266101022449354,
   0.08571334498748197,
   0.08275839610359613,
   0.07918234931666317,
   0.07426328342609229,
   0.06988060640048994,
   0.06630050293035426,
   0.06416020222502394,
   0.06255629755145943,
   0.05993605121961238,
   0.056501114836516174,
   0.054489854879252606,
   0.05343040277987632,
   0.05167322135820997],
  1: [0.22902891415913676,
   0.07109007207760615,
   0.05651968443414175,
   0.04968661173896442,
   0.043700561788069754,
   0.03965291205050142,
   0.03425325108914372,
   0.030743127758902453,
   0.031492630241981526,
   0.027404228997660658,
   0.025628894422047518,
   0.024847202117417932,
   0.023837458160616293,
   0.02334082539714947,
   0.023919287066243928,
   0.02166340457158677,
   0.021459976573945595,
   0.019269244602105457,
   0.01694819313038314,
   0.017852439420729588],
  2: [0.17339281249

In [19]:
psrw_mixing_variance(G, k, steps_num=10**6) #ia-email-univ

Expectation
0.0763636666666
0.1548825
0.0305053333334
0.0011765
0.00243583333333
0.00047275
Normalized Variance
1.18253881644
2.22825367617
4.4635255037
69.8315625443
33.2114266165
175.273576591
Correlation for Graphlet ID1
(1, 0.20972)
(2, 0.05611)
(3, 0.04063)
(4, 0.03096)
(5, 0.02502)
(6, 0.02278)
(7, 0.01800)
(8, 0.01503)
(9, 0.01313)
(10, 0.01266)
(11, 0.00888)
(12, 0.01056)
(13, 0.00947)
(14, 0.00868)
(15, 0.00783)
(16, 0.00693)
(17, 0.00725)
(18, 0.00638)
(19, 0.00437)
(20, 0.00466)
Correlation for Graphlet ID2
(1, 0.15503)
(2, 0.02903)
(3, 0.02169)
(4, 0.01373)
(5, 0.01221)
(6, 0.01079)
(7, 0.00835)
(8, 0.00543)
(9, 0.00526)
(10, 0.00650)
(11, 0.00500)
(12, 0.00592)
(13, 0.00405)
(14, 0.00506)
(15, 0.00486)
(16, 0.00551)
(17, 0.00328)
(18, 0.00347)
(19, 0.00227)
(20, 0.00276)
Correlation for Graphlet ID3
(1, 0.23405)
(2, 0.05166)
(3, 0.03804)
(4, 0.02796)
(5, 0.02134)
(6, 0.01936)
(7, 0.01810)
(8, 0.01466)
(9, 0.01344)
(10, 0.01131)
(11, 0.00842)
(12, 0.00866)
(13, 0.00753)
(14

({0: [0.2097200068606207,
   0.05611040614028658,
   0.040626964458315665,
   0.030956168686815014,
   0.025023516800572836,
   0.02278470984530921,
   0.01800007526102055,
   0.015028123455103426,
   0.013127669364502673,
   0.01266125539836531,
   0.008875589442653625,
   0.010560240481839076,
   0.009469447618432,
   0.008680769777411218,
   0.007827638624373271,
   0.00693422339061504,
   0.0072452505699928255,
   0.006376002439974847,
   0.004366757954953403,
   0.004657642232133712],
  1: [0.15503271509181568,
   0.029033853499832823,
   0.02169137071862306,
   0.013731502699407651,
   0.012207245506058173,
   0.010790557621134301,
   0.008354267021636696,
   0.005426879372538523,
   0.005263636028961086,
   0.006498838593125589,
   0.005002627515706616,
   0.005915113282102409,
   0.004054085472939911,
   0.005064791140491936,
   0.004859452545517826,
   0.005505346789881678,
   0.0032794965118864644,
   0.0034717103010771444,
   0.0022701427740290797,
   0.002761691904881784],


In [21]:
psrw_mixing_variance(G, k, steps_num=10**6) #misc-as-caida k=4

Expectation
0.163633
0.0059635
0.000974
5.33333333333e-06
3e-05
5.83333333333e-07
Normalized Variance
0.0185394551551
82.8433805651
170.115674196
15624.0
2776.77777778
142856.142857
Correlation for Graphlet ID1
(1, 0.44518)
(2, 0.16562)
(3, 0.11296)
(4, 0.08341)
(5, 0.06522)
(6, 0.05330)
(7, 0.04753)
(8, 0.04261)
(9, 0.03774)
(10, 0.03499)
(11, 0.03443)
(12, 0.03454)
(13, 0.03448)
(14, 0.03219)
(15, 0.03023)
(16, 0.02748)
(17, 0.02832)
(18, 0.03017)
(19, 0.02837)
(20, 0.02804)
Correlation for Graphlet ID2
(1, 0.39014)
(2, 0.11996)
(3, 0.08331)
(4, 0.05989)
(5, 0.04673)
(6, 0.03765)
(7, 0.03451)
(8, 0.03163)
(9, 0.02628)
(10, 0.02357)
(11, 0.02535)
(12, 0.02467)
(13, 0.02340)
(14, 0.02230)
(15, 0.02111)
(16, 0.02043)
(17, 0.01924)
(18, 0.02111)
(19, 0.02136)
(20, 0.02068)
Correlation for Graphlet ID3
(1, 0.32391)
(2, 0.05677)
(3, 0.03767)
(4, 0.02493)
(5, 0.01787)
(6, 0.01650)
(7, 0.01495)
(8, 0.01271)
(9, 0.00961)
(10, 0.00944)
(11, 0.00910)
(12, 0.00875)
(13, 0.00892)
(14, 0.00961)
(1

({0: [0.4451808206755524,
   0.165615012137892,
   0.11295699260197355,
   0.08340940819185091,
   0.06522119422923159,
   0.05330022162924126,
   0.04753459340789984,
   0.042608322888323176,
   0.03773800052096126,
   0.034994074722922125,
   0.03443250999020945,
   0.03454244220685749,
   0.03448449995737529,
   0.03218822620695685,
   0.030227697907435364,
   0.027483748135183186,
   0.028321133835166476,
   0.030165774362292855,
   0.028373111891298698,
   0.028035369964847043],
  1: [0.3901438698138647,
   0.11996424415982729,
   0.08330674781668337,
   0.059886668310346065,
   0.04673409266966906,
   0.037654572762826764,
   0.03451495194382904,
   0.03162989298212936,
   0.02628400046911184,
   0.023568639936699772,
   0.025350657878736544,
   0.0246718442211301,
   0.02339903391718761,
   0.022295934176932968,
   0.021107975590919432,
   0.020429154975271705,
   0.01924119214649391,
   0.02110807512944976,
   0.021362679245976155,
   0.020683856169702863],
  2: [0.323906541588

In [12]:
#Garbage code

def fullPSRW(G, N, time_limit=None, time_step=10, query_limit=None, 
             epoch_num=1, ground_truth=None):

    assert (time_limit is None) != (query_limit is None)
    norm_error = 0

    for epoch in range(epoch_num):
        init_graphlet = lift(G, random.choice(list(G.nodes())), N-1)
        old_graphlet = init_graphlet
        type_counter = {i:0 for i in range(len(cached_graphlet_list[N]))}
        t0 = time.time()
        iter_count = 0
        query_count = 0
        time_iter_count = 1
        stop_condition = False
        #print('Starting epoch {}'.format(epoch+1))
        type_list = []
        
        while not stop_condition:
            new_graphlet = SRW_step(G, old_graphlet)[0]
            T = old_graphlet.union(new_graphlet)
            old_graphlet = new_graphlet
            assert len(T)==N
            T_type = find_type(subgraph(G, T))
            type_counter[T_type] += (cached_sub_edge_num[T_type])**(-1)
            
            type_list.append(T_type)
            iter_count += 1
            curr_time = time.time()

            if curr_time - t0 > time_iter_count*time_step:
                print("Time is {} NMSE is {}"
                      .format(int(curr_time-t0), 
                              NMSE(type_counter, ground_truth)))
                print("Number of iterations is {}".format(iter_count))
                time_iter_count += 1

            if time_limit is not None:
                stop_condition = (time.time()-t0 > time_limit)
            if query_limit is not None:
                stop_condition = (iter_count > query_limit)
        
        print(type_list)

        if ground_truth is not None:
            error = NMSE(type_counter, ground_truth)
            print("NMSE error is {}".format(error))
            norm_error += error
            
    norm_error = norm_error*(epoch_num)**(-1)
    return {'ratio': normalize(type_counter), 
            'NMSE': norm_error,
            'type_list': type_list
           }

def fullSRW(G, running_time=120, time_step=10, N=3):
    t0 = time.time()
    v0 = next(iter(G.nodes()))
    old_graphlet = set(list(G.neighbors(v0))[:(N-1)]+[v0])
    type_counter = {i:0 for i in range(len(cached_graphlet_list[N]))}
    time_iter_count = 1
    iter_count = 0
    curr_time = time.time()
    while curr_time - t0 < running_time:
        iter_count += 1
        (new_graphlet, old_graphlet_degree) = SRW_step(G, old_graphlet)
        type_counter[find_type(subgraph(G, old_graphlet))] += old_graphlet_degree**(-1)
        old_graphlet = new_graphlet
        curr_time = time.time()
        if curr_time - t0 > time_iter_count*time_step:
            total_count = sum((val for i,val in type_counter.items()))
            print("Time is {} Type counter is {}"
                  .format(int(curr_time-t0),
                          {i: val/total_count for i,val in type_counter.items()}))
            print("Number of iterations is {}".format(iter_count))
            time_iter_count += 1
    total_count = sum((val for i,val in type_counter.items()))
    #print("Time spent {} sec".format(time.time()-t0))
    return {i: val/total_count for i,val in type_counter.items()}

def brute_force(G, N=3):
    type_counter = {i:0 for i in range(len(cached_graphlet_list[N]))}
    if N==3:
        percent_count = 0
        counter = 0
        for u,v in G.edges():
            for w in set(G.neighbors(u))-{v}:
                if w in G.neighbors(v): 
                    type_counter[1] += 1
                else:
                    type_counter[0] += 1
            for w in set(G.neighbors(v))-{u}:
                if w in G.neighbors(u): 
                    type_counter[1] += 1
                else:
                    type_counter[0] += 1
            counter += 1
            if counter > percent_count*0.00001*G.number_of_edges():
                clear_output()
                #print("{}% complete".format(percent_count*0.001))
                percent_count += 1
        type_counter[0] = type_counter[0]/2
        type_counter[1] = type_counter[1]/6
        
    if N==4:
        for u,v in G.edges():
            neigh = set(G.neighbors(u)).union(set(G.neighbors(v)))-{u,v}
            for w,z in itertools.combinations(neigh, 2):
                T = subgraph(G, {u,v,w,z})
                type_counter[find_type(T)] += 1
        type_counter[0] = type_counter[0]/3
        type_counter[1] = type_counter[1]
        type_counter[2] = type_counter[2]/3
        type_counter[3] = type_counter[3]/4
        type_counter[4] = type_counter[4]/5
        type_counter[5] = type_counter[5]/6
    return type_counter

def NMSE(dict_hat, dict_true):
    norm_dict_hat = normalize(dict_hat)
    norm_dict_true = normalize(dict_true)
    return sum(((norm_dict_hat[i]*freq**(-1) - 1)**2
                for i, freq in norm_dict_true.items() if freq != 0))

def normalize(dict_hat):
    total_count = sum((val for i,val in dict_hat.items()))
    return {i: val*(total_count)**(-1) for i,val in dict_hat.items()}

def scale(dict_hat, scalar):
    return {i: int(val*scalar) for i,val in dict_hat.items()}
