In [2]:
import pandas as pd
import numpy as np
import random as rand
import tqdm
import networkx as nx
import pickle as pkl
import gower

In [8]:
# Load the pickled graph dictionary.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load graph pickles that come from a trusted source.
# The handle is deliberately not called `input`, which would rebind the
# builtin of the same name for every later cell in this kernel.
with open('graph_349519.pkl', 'rb') as graph_file:
    graph = pkl.load(graph_file)

# Extracting data
edge_index = graph['edge_index']
num_nodes = graph['num_nodes']

# Create a networkx graph
G = nx.Graph()

# Add nodes: one node per integer id in 0 .. num_nodes - 1
G.add_nodes_from(range(num_nodes))

# Add edges: entries of edge_index[0] and edge_index[1] are paired
# position by position into (u, v) edges
edges = list(zip(edge_index[0], edge_index[1]))
G.add_edges_from(edges)

In [116]:
# Node features as stored in the graph dictionary; getAttributes() indexes
# this object by node id when building the inputs for the Gower distances.
attributes = graph['node_feat']

# DataFrame view of the same features, used as a per-node lookup table
# (the 'target' column is added to it in a later cell).
lu = pd.DataFrame(data=attributes)

In [440]:
def prototype(G):
    """Return one randomly chosen element of ``G``.

    ``G`` is iterated once to collect the candidates (for a networkx graph
    this yields its nodes) and ``random.choice`` picks one of them.
    """
    candidates = list(G)
    return rand.choice(candidates)

def getAttributes(x):
    """Return the feature values of node ``x`` as a list of Python floats.

    Reads the module-level ``attributes`` object, indexes it with ``x`` and
    converts every entry of that row with ``float``.
    """
    return list(map(float, attributes[x]))

def ranking(x, S: nx.Graph):
    """Rank the nodes reachable from ``x``: by hop distance, ties by Gower distance.

    Nodes are ordered by their shortest-path length from ``x`` in ``S``.
    Nodes at the same hop distance (a "tier") are ordered among themselves by
    the Gower distance between their attribute row and the attribute row of
    ``x`` (smallest first), as computed by ``gower.gower_matrix``. Remaining
    ties fall back to comparing the ``(node, hops)`` pairs themselves.

    Parameters
    ----------
    x : node of ``S`` used as the source of the ranking.
    S : graph in which the shortest-path lengths are computed.

    Returns
    -------
    list of ``(node, target)`` tuples in rank order, where ``target`` is the
    value of column ``'target'`` of the module-level ``lu`` frame for that
    node. ``x`` itself is at hop distance 0 and therefore comes first.
    """
    hop_lengths = nx.shortest_path_length(S, x)

    # (node, hops) pairs. The stable sort keeps the order returned by networkx
    # inside each tier and guarantees that every tier is one contiguous run,
    # instead of silently relying on the dict already being distance-ordered.
    ordered = sorted(hop_lengths.items(), key=lambda pair: pair[1])

    # Loop-invariant: the source node's attributes as a one-row frame.
    source_frame = pd.DataFrame([getAttributes(x)])

    total = len(ordered)
    start = 0
    while start < total:
        # Find the end of the run of nodes sharing this hop distance.
        hops = ordered[start][1]
        stop = start + 1
        while stop < total and ordered[stop][1] == hops:
            stop += 1

        # A single-node tier (e.g. the source itself) has nothing to reorder,
        # so the Gower computation is skipped for it.
        if stop - start > 1:
            tier = ordered[start:stop]
            tier_frame = pd.DataFrame([getAttributes(node) for node, _ in tier])
            # Row 0 holds the distances from the source to every tier member.
            gower_row = gower.gower_matrix(source_frame, tier_frame)[0]
            ordered[start:stop] = [
                entry for _, entry in sorted(zip(gower_row, tier))
            ]
        start = stop

    # Scalar lookup of the target flag; avoids building a full row per node.
    return [(node, lu.at[node, 'target']) for node, _ in ordered]

def Q(S, G, target):
    """Quality of subgroup ``S`` measured against reference group ``G``.

    Computes ``sqrt(len(S) / len(G)) * (p_S - p_G)`` and returns its absolute
    value, where ``p_S`` and ``p_G`` are the sums of column ``target`` of the
    module-level ``lu`` frame over the rows labelled by ``S`` and ``G``,
    divided by the respective group sizes. Note that the weight is the
    square root of the coverage, not the coverage itself.

    Parameters
    ----------
    S : list of row labels of ``lu`` forming the subgroup.
    G : list of row labels of ``lu`` forming the reference group.
    target : name of the column of ``lu`` that is summed.

    Returns
    -------
    The non-negative quality value.
    """
    S_size = len(S)
    G_size = len(G)
    cover = S_size / G_size

    # Select the single column together with the rows, rather than copying
    # every column of the selected rows and then picking one.
    n_target_S = lu.loc[S, target].sum()
    n_target_G = lu.loc[G, target].sum()

    # Difference between the target rate in the subgroup and in the reference.
    rate_gap = (n_target_S / S_size) - (n_target_G / G_size)
    WRAcc = (cover ** 0.5) * rate_gap

    return abs(WRAcc)

def Discovery(ranks, min_rho=5, min_sigma=3):
    """Search a ranking for the best outer prefix (rho) and inner prefix (sigma).

    Two scans over prefixes of ``ranks`` are performed, both scored with
    ``Q(..., 'target')``:

    1. ``rho``: the prefix length in ``range(min_rho, len(ranks))`` whose nodes
       score highest against all nodes of the module-level graph ``G``.
    2. ``sigma``: the prefix length in ``range(min_sigma, rho)`` whose nodes
       score highest against the first ``rho`` ranked nodes.

    Both comparisons use ``>=``, so among equally scored prefixes the longest
    one wins. A scan over an empty range leaves its result at 0.

    Parameters
    ----------
    ranks : sequence of tuples whose first element is a node id, in rank order.
    min_rho : smallest prefix length tried for ``rho`` (default 5).
    min_sigma : smallest prefix length tried for ``sigma`` (default 3).

    Returns
    -------
    ``(rho, sigma, best, ranks)`` where ``best`` is the score found by the
    second scan (the first scan's score is not returned) and ``ranks`` is the
    input object, passed through unchanged.
    """
    # Loop-invariant inputs: build them once instead of on every iteration.
    ranked_nodes = [entry[0] for entry in ranks]
    population = list(G.nodes)

    rho = 0
    sigma = 0
    best = 0

    # Scan 1: best prefix measured against the whole graph.
    for i in range(min_rho, len(ranked_nodes)):
        q = Q(ranked_nodes[:i], population, 'target')
        if q >= best:
            best = q
            rho = i

    # Scan 2: best shorter prefix measured against the rho-prefix.
    neighbourhood = ranked_nodes[:rho]
    best = 0
    for i in range(min_sigma, rho):
        q = Q(ranked_nodes[:i], neighbourhood, 'target')
        if q >= best:
            best = q
            sigma = i

    return rho, sigma, best, ranks

In [441]:
# Boolean target: True for rows whose value in column 0 is at least 6.
lu['target'] = lu[0].ge(6)

In [442]:
# Run the discovery once per node, using that node as the ranking source.
# Results are collected in a list and turned into a DataFrame in one step:
# growing a frame row by row with `out.loc[node] = ...` reallocates on every
# insert and leaves every column with object dtype.
source_nodes = list(G.nodes)
discovery_rows = []
for node in tqdm.tqdm(source_nodes):
    rho, sigma, q, ranks = Discovery(ranking(node, G))
    discovery_rows.append({'rho': rho, 'sigma': sigma, 'q': q, 'ranks': ranks})

# One row per source node, indexed by the node id.
out = pd.DataFrame(discovery_rows, index=source_nodes,
                   columns=['rho', 'sigma', 'q', 'ranks'])

100%|██████████| 332/332 [01:55<00:00,  2.87it/s]


In [446]:
# Source nodes ordered by their quality score, highest first.
out.sort_values('q', ascending=False)

Unnamed: 0,rho,sigma,q,ranks
46,9,3,0.320750,"[(46, False), (44, False), (47, False), (45, T..."
28,14,5,0.230508,"[(28, True), (27, False), (26, True), (29, Fal..."
27,14,5,0.230508,"[(27, False), (26, True), (28, True), (29, Fal..."
34,13,5,0.228987,"[(34, False), (33, True), (35, True), (36, Fal..."
35,13,5,0.228987,"[(35, True), (34, False), (33, True), (36, Fal..."
...,...,...,...,...
104,5,4,0.000000,"[(104, False), (103, False), (105, False), (10..."
105,6,5,0.000000,"[(105, False), (104, False), (106, False), (10..."
106,6,5,0.000000,"[(106, False), (104, False), (105, False), (10..."
111,5,4,0.000000,"[(111, False), (112, False), (110, False), (11..."


In [439]:
# Source nodes whose rho exceeds 10, largest rho first.
# NOTE(review): this cell's saved execution count (439) is lower than that of
# the cell defining the functions (440), so the stored output may predate the
# current definitions; re-run the notebook top to bottom to refresh it.
out.loc[out['rho'] > 10].sort_values('rho', ascending=False)

Unnamed: 0,rho,sigma,q,ranks
6,181,14,0.043792,"[(6, False), (5, True), (7, True), (8, False),..."
92,169,1,0.051889,"[(92, True), (91, False), (93, True), (84, Fal..."
88,163,1,0.052858,"[(88, True), (87, False), (89, False), (86, Fa..."
93,162,1,0.052863,"[(93, True), (91, False), (94, False), (99, Fa..."
72,146,1,0.056118,"[(72, True), (70, False), (73, False), (81, Fa..."
...,...,...,...,...
27,14,5,0.230508,"[(27, False), (26, True), (28, True), (29, Fal..."
34,13,5,0.228987,"[(34, False), (33, True), (35, True), (36, Fal..."
258,11,1,0.164461,"[(258, False), (259, True), (260, True), (257,..."
38,11,1,0.164461,"[(38, False), (39, True), (40, True), (37, Fal..."
