In [2]:
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.isomorphism as iso
import sympy
import numpy as np
import random
import time
import itertools
import math
from IPython.display import clear_output

In [51]:
def graphlet_list(k):
    """Collect the connected graphs of the NetworkX graph atlas by node count.

    Parameters
    ----------
    k : int
        Largest number of nodes to collect; must be positive.

    Returns
    -------
    dict
        Maps every ``n`` in ``1..k`` to the list of connected atlas graphs
        with ``n`` nodes, in atlas order.  The position of a graph inside
        that list is what the rest of this notebook uses as its "type".

    Raises
    ------
    ValueError
        If the atlas ends before reaching graphs with ``k`` nodes.
    """
    assert k > 0
    loc_graphlet_list = {n: [] for n in range(1, k+1)}
    largest_order_seen = 0
    # Read the atlas once instead of asking for one index at a time, and stop
    # cleanly at its end: the previous open-ended index loop ran past the last
    # atlas entry whenever no graph with more than k nodes exists.
    # Entry 0 is skipped, exactly as the original loop (which started at 1) did.
    for G in nx.graph_atlas_g()[1:]:
        n = G.number_of_nodes()
        if n > k:
            # Relies on the atlas being ordered by node count, as before.
            break
        largest_order_seen = max(largest_order_seen, n)
        if nx.is_connected(G):
            loc_graphlet_list[n].append(G)
    else:
        # The atlas was exhausted without ever exceeding k nodes.
        if largest_order_seen < k:
            raise ValueError(
                'graph atlas only contains graphs with up to %d nodes, '
                'but k=%d was requested' % (largest_order_seen, k))
    return loc_graphlet_list
    

def find_type_match(T):
    """Return the graphlet type of ``T`` together with a node labelling.

    Parameters
    ----------
    T : graph
        Graph exposing ``number_of_nodes``, ``number_of_edges``, ``nodes``,
        ``degree`` and ``neighbors``.

    Returns
    -------
    tuple
        ``(type_index, mapping)``.  For graphs with at most four nodes the
        type is decided by hand from edge count and maximum degree, and
        ``mapping`` sends every node of ``T`` to a label in ``0..n-1``.
        NOTE(review): the hand-picked labels are presumably meant to agree
        with the node labels of ``cached_graphlet_list[n][type_index]`` --
        confirm against the atlas.  For anything else the type is the index
        of the first isomorphic graph in ``cached_graphlet_list[n]`` and
        ``mapping`` is the ``GraphMatcher`` mapping found for it.

    Raises
    ------
    StopIteration
        If no graph in ``cached_graphlet_list[n]`` is isomorphic to ``T``.
    KeyError
        If ``cached_graphlet_list`` has no entry for ``n``.
    """
    n = T.number_of_nodes()
    if n == 1:
        return (0, {u: 0 for u in T.nodes()})
    if n == 2:
        return (0, {u: i for i, u in enumerate(T.nodes())})
    if n == 3:
        if T.number_of_edges() == 2:
            # Path on three nodes: the middle node gets label 0.
            u0 = next((node for node in T.nodes() if T.degree(node) == 2))
            (u1, u2) = (node for node in T.neighbors(u0))
            return (0, {u0: 0, u1: 1, u2: 2})
        if T.number_of_edges() == 3:
            # Triangle: every labelling is equivalent.
            return (1, {u: i for i, u in enumerate(T.nodes())})
    if n == 4:
        e_num = T.number_of_edges()
        max_degree = max((T.degree(node) for node in T.nodes()))
        if e_num == 3 and max_degree == 3:
            # Star: the centre gets label 3.
            u3 = next((node for node in T.nodes() if T.degree(node) == 3))
            (u0, u1, u2) = (node for node in T.neighbors(u3))
            return (0, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 3 and max_degree == 2:
            # Path: inner nodes are 0 and 1; 2 hangs off 1 and 3 hangs off 0.
            (u0, u1) = (node for node in T.nodes() if T.degree(node) == 2)
            u2 = next((node for node in T.neighbors(u1) if node != u0))
            u3 = next((node for node in T.neighbors(u0) if node != u1))
            return (1, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 4 and max_degree == 3:
            # Triangle with a pendant node: pendant is 0, degree-3 node is 3.
            u3 = next((node for node in T.nodes() if T.degree(node) == 3))
            (u1, u2) = (node for node in T.nodes() if T.degree(node) == 2)
            u0 = next((node for node in T.nodes() if T.degree(node) == 1))
            return (2, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 4 and max_degree == 2:
            # 4-cycle: labels follow the cycle 0-1-2-3-0.
            u0 = next((node for node in T.nodes()))
            (u1, u3) = (node for node in T.neighbors(u0))
            u2 = next((node for node in T.neighbors(u1) if node != u0))
            return (3, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 5:
            # Complete graph minus one edge: degree-3 nodes are 0 and 2.
            (u0, u2) = (node for node in T.nodes() if T.degree(node) == 3)
            (u1, u3) = (node for node in T.nodes() if T.degree(node) == 2)
            return (4, {u0: 0, u1: 1, u2: 2, u3: 3})
        if e_num == 6:
            (u0, u1, u2, u3) = (node for node in T.nodes())
            return (5, {u0: 0, u1: 1, u2: 2, u3: 3})
    # Generic fallback (n > 4, or a small graph none of the cases above caught).
    # Keep the very matcher whose is_isomorphic() succeeded: its ``mapping`` is
    # only filled in by that call.  The previous code returned a fresh matcher
    # and relied on an ``assert`` to populate it, which both repeated the
    # search and broke when assertions are disabled.
    candidates = ((i, iso.GraphMatcher(T, T_))
                  for (i, T_) in enumerate(cached_graphlet_list[n]))
    matches = ((i, matcher)
               for (i, matcher) in candidates
               if matcher.is_isomorphic())
    (type_index, matcher) = next(matches)
    return (type_index, matcher.mapping)

def find_type(T):
    """Return the graphlet type index of ``T`` without computing a node mapping.

    Graphs on one or two nodes are always type 0.  Three- and four-node graphs
    are classified from their edge count (and, for four nodes with three or
    four edges, their maximum degree).  Everything not settled that way is
    compared against ``cached_graphlet_list[n]`` and the index of the first
    isomorphic entry is returned; ``next`` raises ``StopIteration`` when there
    is none.
    """
    order = T.number_of_nodes()
    if order == 1 or order == 2:
        return 0
    if order == 3:
        edge_total = T.number_of_edges()
        if edge_total == 2:
            return 0
        if edge_total == 3:
            return 1
    if order == 4:
        edge_total = T.number_of_edges()
        top_degree = max(T.degree(v) for v in T.nodes())
        # (edge count, maximum degree) -> type, for the sparse shapes.
        sparse_types = {(3, 3): 0, (3, 2): 1, (4, 3): 2, (4, 2): 3}
        if (edge_total, top_degree) in sparse_types:
            return sparse_types[(edge_total, top_degree)]
        # With five or six edges the edge count alone decides.
        dense_types = {5: 4, 6: 5}
        if edge_total in dense_types:
            return dense_types[edge_total]
    # Generic isomorphism search for whatever was not classified above.
    matching_indices = (
        idx
        for (idx, candidate) in enumerate(cached_graphlet_list[order])
        if iso.GraphMatcher(T, candidate).is_isomorphic())
    return next(matching_indices)

def subgraph(G, nodes):
    """Build a new ``nx.Graph`` induced by ``nodes`` on ``G``.

    Parameters
    ----------
    G : graph
        Source graph; only ``G.has_edge`` is used.
    nodes : iterable
        Nodes to keep.  Any iterable is accepted (it is materialised once),
        so sets, lists and generators all work.

    Returns
    -------
    nx.Graph
        Fresh graph containing ``nodes`` and every edge of ``G`` between two
        of them.  No node or edge attributes are copied.  A node that is not
        in ``G`` simply ends up isolated.
    """
    # Materialise once: the original consumed ``nodes`` with list() and then
    # used the raw argument again for len() and add_nodes_from().
    list_nodes = list(nodes)
    T = nx.Graph()
    T.add_nodes_from(list_nodes)
    for i, u in enumerate(list_nodes):
        for v in list_nodes[:i]:
            # has_edge is a direct adjacency lookup; the previous
            # ``u in G.neighbors(v)`` walked the whole neighbour iterator.
            if G.has_edge(v, u):
                T.add_edge(u, v)
    return T

def random_walk_nodes(G, v0, steps_num):
    """Walk ``steps_num`` uniform random steps from ``v0`` and return the end node.

    Each step moves to a neighbour picked by ``random.choice`` from
    ``G.neighbors`` of the current node.  With ``steps_num == 0`` the start
    node is returned unchanged.
    """
    position = v0
    for _step in range(steps_num):
        options = list(G.neighbors(position))
        position = random.choice(options)
    return position

# One entry per known dataset:
#   name -> (edge-list path, weighted?, strip self-loops?, {k: ground truth}).
# "weighted" means the third column of the file is read as a float 'weight'.
# A ground-truth dict maps graphlet type index -> count for that k.
_GRAPH_SPECS = {
    'com-amazon': (
        'Graphs/com-amazon.ungraph.txt', False, False,
        {3: {0: 7750799,
             1: 667129},
         4: {0: 124295537,
             1: 37383434,
             2: 13674662,
             3: 422515,
             4: 1874925,
             5: 275961}}),
    'com-dblp': (
        'Graphs/com-dblp.ungraph.txt', False, False,
        {3: {0: 15107734,
             1: 2224385},
         4: {0: 258570802,
             1: 252447350,
             2: 96615211,
             3: 203394,
             4: 4764685,
             5: 16713192}}),
    'com-lj': (
        'Graphs/com-lj.ungraph.txt', False, False,
        {3: {0: 3722307805,
             1: 177820130},
         4: {0: 1983908933796,
             1: 542683013686,
             2: 57662704306,
             3: 2541452010,
             4: 8190586835,
             5: 521691844}}),
    'com-youtube': (
        'Graphs/com-youtube.ungraph.txt', False, False,
        {3: {0: 1465313402,
             1: 3056386},
         4: {0: 5730407268993,
             1: 91488735459,
             2: 12371157628,
             3: 231979854,
             4: 221833272,
             5: 4986965}}),
    'misc-net25': (
        'Graphs/misc-net25.mtx', False, True,
        {3: {0: 12690840,
             1: 64090},
         4: {0: 361490550,
             1: 550792350,
             2: 12554670,
             3: 44915955,
             4: 0,
             5: 0}}),
    'bio-celegansneural': (
        'Graphs/bio-celegansneural.mtx', True, False, {}),
    'bio-yeast': (
        'Graphs/bio-yeast.mtx', False, False, {}),
    'bn-macaque-rhesus_brain_1': (
        'Graphs/bn-macaque-rhesus_brain_1.edges', False, False, {}),
    'bn-mouse_brain_1': (
        'Graphs/bn-mouse_brain_1.edges', False, False, {}),
    'ia-email-univ': (
        'Graphs/ia-email-univ.mtx', False, False, {}),
    'misc-polblogs': (
        'Graphs/misc-polblogs.mtx', True, False, {}),
    'misc-as-caida': (
        'Graphs/misc-as-caida.mtx', True, False,
        {3: {0: 59513652,
             1: 72730},
         4: {0: 62565214368,
             1: 2808802860,
             2: 203097552,
             3: 3774144,
             4: 4084544,
             5: 0}}),
    'misc-fullb': (
        'Graphs/misc-fullb.mtx', False, True,
        {3: {0: 162067420,
             1: 60212260},
         4: {0: 1078734774,
             1: 4837795036,
             2: 2707584768,
             3: 64898820,
             4: 897215295,
             5: 370980150}}),
    'misc-neos3': (
        'Graphs/misc-neos3.mtx', True, False,
        {3: {0: 207426691,
             1: 505603},
         4: {0: 59618248397,
             1: 11164704825,
             2: 120388385,
             3: 2047846,
             4: 499122,
             5: 0}}),
    'misc-discogs_affiliation': (
        'Graphs/misc-discogs_affiliation.edges', False, False,
        {4: {0: 208345722513295,
             1: 851118877585,
             2: 58223406336,
             3: 3008868833,
             4: 439215089,
             5: 654413}}),
    'misc-amazon-ratings': (
        'Graphs/misc-amazon-ratings.edges', False, False,
        {3: {0: 699425719,
             1: 79638},
         4: {0: 719668204837,
             1: 40966346985,
             2: 184396006,
             3: 37045086,
             4: 561566,
             5: 671}}),
    'misc-dbpedia-all': (
        'Graphs/misc-dbpedia-all.edges', False, False,
        {3: {0: 174250340949,
             1: 8329548},
         4: {0: 19646604300441472,
             1: 1652259549599,
             2: 622928133900,
             3: 15925209557,
             4: 15630164176,
             5: 4609834}}),
}


def load_graph(name, k=3):
    """Read one of the known benchmark graphs from the ``Graphs/`` directory.

    Parameters
    ----------
    name : str
        Dataset key, e.g. ``'com-amazon'`` or ``'misc-as-caida'``.
    k : int, optional
        Graphlet size whose reference counts should be returned.

    Returns
    -------
    dict
        ``{'graph': G, 'ground_truth': counts_or_None}`` where ``G`` is an
        undirected ``nx.Graph`` read with ``nx.read_edgelist`` and
        ``ground_truth`` maps graphlet type index to the stored reference
        count, or is ``None`` when no counts are stored for this dataset/``k``.

    Raises
    ------
    KeyError
        If ``name`` is not a known dataset.
    """
    if name not in _GRAPH_SPECS:
        raise KeyError('unknown graph name: %r' % (name,))
    (path, weighted, strip_self_loops, truths) = _GRAPH_SPECS[name]

    read_kwargs = {'create_using': nx.Graph()}
    if weighted:
        read_kwargs['data'] = (('weight', float),)
    G = nx.read_edgelist(path, **read_kwargs)

    if strip_self_loops:
        # Only remove loops that exist: calling remove_edge(v, v)
        # unconditionally fails on the first node without a self-loop.
        for v in list(G.nodes()):
            if G.has_edge(v, v):
                G.remove_edge(v, v)

    ground_truth = truths.get(k)
    if ground_truth is not None:
        # Hand out a copy so callers cannot alter the shared table.
        ground_truth = dict(ground_truth)

    return {'graph': G, 'ground_truth': ground_truth}

def waddling_mixing(G, k, steps_num=1000, burn_in_limit=20):
    """Estimate how strongly consecutive 4-node samples depend on each other.

    Runs one long random walk on ``G``.  Every iteration takes three steps
    from the current start node; when the four visited nodes are distinct
    their induced subgraph is classified with ``find_type``.  For every lag
    ``0..burn_in_limit-1`` the function counts which type followed which, and
    compares the conditional type distribution with the overall one.

    Parameters
    ----------
    G : graph
        Graph to walk on.
    k : int
        Graphlet size; only ``4`` is supported (asserted).
    steps_num : int, optional
        Number of sampling iterations.
    burn_in_limit : int, optional
        Number of lags to evaluate.

    Returns
    -------
    list of float
        Entry ``b`` is the sum over types ``i`` of
        ``P(type i) * TV(P(next type | type i seen b+1 samples earlier), P)``,
        where ``TV`` is half the L1 distance.

    Raises
    ------
    ZeroDivisionError
        If no iteration produced four distinct nodes.
    """
    assert k==4
    v0 = random.choice(list(G.nodes()))
    graphlet_num = len(cached_graphlet_list[k])
    type_counter = [0]*graphlet_num
    # pair_counter[lag][earlier_type][later_type]
    pair_counter = [[[0]*graphlet_num for _ in range(graphlet_num)]
                    for _ in range(burn_in_limit)]
    # Most recent sample types, newest first; None marks "no sample yet".
    memory = [None]*burn_in_limit
    for _ in range(steps_num):
        v1 = random_walk_nodes(G, v0, 1)
        v2 = random_walk_nodes(G, v1, 1)
        v3 = random_walk_nodes(G, v2, 1)
        T = {v0, v1, v2, v3}
        if len(T)==4:
            T_type = find_type(subgraph(G,T))
            # Four distinct nodes visited along a walk can never have type 0.
            assert T_type != 0
            type_counter[T_type] += 1
            for lag, earlier_type in enumerate(memory):
                if earlier_type is None:
                    break
                pair_counter[lag][earlier_type][T_type] += 1
            # Push the new type and keep exactly burn_in_limit entries.
            memory = ([T_type] + memory)[:burn_in_limit]
        v0 = random_walk_nodes(G, v3, 1)
    total_counter = sum(type_counter)
    type_prob = [type_counter[i]*total_counter**(-1)
                 for i in range(graphlet_num)]
    beta_coeff = [0]*burn_in_limit
    for burn_in in range(burn_in_limit):
        TV = {}
        for i in range(graphlet_num):
            cond_counter = pair_counter[burn_in][i]
            cond_total = sum(cond_counter)
            if cond_total==0:
                # No observation conditioned on type i at this lag.  The
                # check must look at type i's own probability; comparing the
                # whole list with 0 (as before) was always False.
                TV[i] = 0 if type_prob[i]==0 else 1
                continue
            cond_prob = [count*cond_total**(-1)
                         for count in cond_counter]
            TV[i] = sum((abs(cond_prob[j] - type_prob[j])
                        for j in range(graphlet_num)))/2
        beta_coeff[burn_in] = sum((TV[i]*type_prob[i]
                                   for i in range(graphlet_num)))
    return beta_coeff


def waddling_variance(G, k, steps_num=1000, burn_in=3):
    """Estimate a normalised spread of the per-sample 4-graphlet estimator.

    Uses the same sampling scheme as ``waddling_count``: three random-walk
    steps per sample, then ``burn_in`` further steps before the next one.
    For each accepted sample the inverse of its weighted sampling probability
    and the square of that inverse are accumulated per type.

    Parameters
    ----------
    G : graph
        Graph to walk on.
    k : int
        Graphlet size; only ``4`` is supported (asserted).
    steps_num : int, optional
        Number of sampling iterations.
    burn_in : int, optional
        Random-walk steps taken between two samples.

    Returns
    -------
    dict
        For every type with at least one sample:
        ``sqrt(sum of squared inverses) / (sum of inverses) * sqrt(samples)``.
        Types never sampled are left out.
    """
    assert k==4
    v0 = random.choice(list(G.nodes()))
    graphlet_num = len(cached_graphlet_list[k])
    variance_counter = {i:0 for i in range(graphlet_num)}
    expectation_counter = {i:0 for i in range(graphlet_num)}
    # Per-type multiplicity used to weight the path probability.  These are
    # half the values used in waddling_count; a constant factor cancels in
    # the ratio returned below.
    longest_paths = {1:1, 2:2, 3:4, 4:6, 5:12}
    # Loop-invariant: the graph is not modified while sampling, so the edge
    # count is computed once instead of once per sample.
    inv_degree_sum = (2*G.number_of_edges())**(-1)
    sample_counter = 0
    for _ in range(steps_num):
        v1 = random_walk_nodes(G, v0, 1)
        v2 = random_walk_nodes(G, v1, 1)
        v3 = random_walk_nodes(G, v2, 1)
        T = {v0, v1, v2, v3}
        if len(T)==4:
            T_type = find_type(subgraph(G,T))
            # Four distinct nodes visited along a walk can never have type 0.
            assert T_type != 0
            T_prob = (longest_paths[T_type] *
                      (G.degree(v1)*G.degree(v2))**(-1) *
                      inv_degree_sum)
            expectation_counter[T_type] += (T_prob)**(-1)
            variance_counter[T_type] += (T_prob)**(-2)
            sample_counter += 1
        v0 = random_walk_nodes(G, v3, burn_in)
    norm_variance = {i: (variance_counter[i]**(0.5)
                         * expectation_counter[i]**(-1)
                         * sample_counter**(0.5))
                     for i in range(graphlet_num)
                     if expectation_counter[i]!=0}
    return norm_variance

def waddling_count(G, k, steps_num=1000, burn_in=3):
    """Estimate 4-node graphlet counts of ``G`` from random-walk samples.

    Every iteration walks three steps ``v0 -> v1 -> v2 -> v3``.  If the four
    nodes are distinct, their induced subgraph is classified with
    ``find_type`` and the inverse of
    ``longest_paths[type] / (deg(v1) * deg(v2) * 2 * |E|)`` is added to that
    type's accumulator.  The walk then continues for ``burn_in`` steps to
    pick the next start node.

    Parameters
    ----------
    G : graph
        Graph to walk on.
    k : int
        Graphlet size; only ``4`` is supported (asserted).
    steps_num : int, optional
        Number of sampling iterations.
    burn_in : int, optional
        Random-walk steps taken between two samples.

    Returns
    -------
    dict
        Maps every type index to its accumulated value divided by the number
        of accepted samples.  Type 0 is never sampled and therefore stays 0.
        NOTE(review): the average is taken over accepted samples only, not
        over all ``steps_num`` iterations -- confirm this is the intended
        normalisation.

    Raises
    ------
    ZeroDivisionError
        If no iteration produced four distinct nodes.
    """
    assert k==4
    v0 = random.choice(list(G.nodes()))
    graphlet_num = len(cached_graphlet_list[k])
    expectation_counter = {i:0 for i in range(graphlet_num)}
    # Per-type multiplicity used to weight the path probability.
    longest_paths = {1:2, 2:4, 3:8, 4:12, 5:24}
    # Loop-invariant: the graph is not modified while sampling, so the edge
    # count is computed once instead of once per sample.
    inv_degree_sum = (2*G.number_of_edges())**(-1)
    sample_counter = 0
    for _ in range(steps_num):
        v1 = random_walk_nodes(G, v0, 1)
        v2 = random_walk_nodes(G, v1, 1)
        v3 = random_walk_nodes(G, v2, 1)
        T = {v0, v1, v2, v3}
        if len(T)==4:
            T_type = find_type(subgraph(G,T))
            # Four distinct nodes visited along a walk can never have type 0.
            assert T_type != 0
            T_prob = (longest_paths[T_type] *
                      (G.degree(v1)*G.degree(v2))**(-1) *
                      inv_degree_sum)
            expectation_counter[T_type] += (T_prob)**(-1)
            sample_counter += 1
        v0 = random_walk_nodes(G, v3, burn_in)
    expectation_counter = {i: expectation_counter[i] * sample_counter**(-1)
                           for i in range(graphlet_num)}
    return expectation_counter

In [4]:
# Build the module-level graphlet cache.  find_type, find_type_match and the
# waddling_* functions read `cached_graphlet_list` as a global, so this cell
# has to run before any of them is called.
# NOTE(review): this cell shows execution count 4 while the definitions cell
# shows 51, so the saved outputs may not come from a top-to-bottom run --
# confirm with Restart & Run All.
k=4
cached_graphlet_list = graphlet_list(k)

In [59]:
# Load the 'misc-as-caida' graph (the k=4 ground truth that load_graph also
# returns is discarded here) and report its node and edge counts.
G = load_graph('misc-as-caida',4)['graph']
print(G.number_of_nodes(), G.number_of_edges())

(26475, 53381)


In [60]:
# Per-type 4-graphlet estimates from 10**4 sampling iterations, with 3
# random-walk steps between samples.  The walk is not seeded, so the numbers
# change from run to run.
waddling_count(G, 4, steps_num=10**4, burn_in=3)

{0: 0.0,
 1: 581236701.9660378,
 2: 90305675.10433961,
 3: 623213.1031132075,
 4: 2447970.406257862,
 5: 150350.60806603773}

In [61]:
# Normalised spread of the per-sample estimator for each type that was seen,
# using the same sampling parameters as the count run (unseeded).
waddling_variance(G,4, steps_num=10**4, burn_in=3)

{1: 8.177056443884464,
 2: 14.905820512505171,
 3: 11.520318481331508,
 4: 7.310441035294313,
 5: 34.5947926023176}

In [29]:
# Dependence between sample types for lags 0..19, from 10**6 sampling
# iterations (unseeded).
# NOTE(review): execution count 29 is lower than the cells above (60, 61), so
# this output may come from an earlier kernel state -- confirm by re-running.
waddling_mixing(G,4, steps_num=10**6, burn_in_limit=20)

[0.007192884611618636,
 0.002357923063359294,
 0.001426862021167304,
 0.0010576847372578608,
 0.0011303077201153318,
 0.0011225871829868015,
 0.0009632628254089391,
 0.0009697538728659857,
 0.0014345224584101125,
 0.0006599848586401661,
 0.001576815828207412,
 0.001447363337659622,
 0.001629063128535375,
 0.0012136000233701183,
 0.001395930394694255,
 0.0008592706135265291,
 0.0014456472428648144,
 0.0010661317838560555,
 0.0012685791768335328,
 0.0008796597181606179]