In [36]:
#Convert the current code to python script
#jupyter nbconvert --to script KNNWeights.ipynb

# Test Dataset to check

In [37]:
import multiprocessing
# Number of CPUs as reported by multiprocessing.cpu_count().
# KNNWeight.get_knn_weight_multiproces uses it to size the worker pool and
# to decide into how many blocks the node range is split.
NUM_PROCESSORS=multiprocessing.cpu_count()
# print("Cpu count: ",NUM_PROCESSORS)

In [38]:
# As it turned out, interactive shells (like Jupyter) cannot handle CPU multiprocessing well, so check which medium the code is running in.
# We write the code in Jupyter for understanding purposes, but the final execution will be in a shell.
from ipynb.fs.full.Utils import isnotebook
from ipynb.fs.full.Dataset import generate_synthetic
import networkx as nx
from torch_geometric.utils import to_networkx, from_networkx
import torch_geometric.utils.homophily as homophily
import copy
import ipynb.fs.full.utils.MoonGraph as MoonGraph

In [39]:
import torch
import torch.nn as nn
from torch_sparse import SparseTensor
from tqdm import tqdm
import math
import time
import torch.nn.functional as F

import random
# Seed Python's built-in RNG with a fixed value.
random.seed(12345)
import numpy as np
# Seed NumPy's global RNG with the same fixed value.
# NOTE(review): no torch seed is set in the visible cells - confirm whether
# any torch-side randomness is expected to be repeatable.
np.random.seed(12345)

In [40]:
import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from multiprocessing.pool import ThreadPool, Pool

In [41]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets

In [42]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration(argv=None):
    """Build the command-line parser and parse the run configuration.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default) lets argparse read
        ``sys.argv[1:]``, which is what a zero-argument call has always done.

    Returns
    -------
    tuple
        ``(args, dict_args)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``dict_args`` is ``vars(args)``.
    """

    def _str2bool(value):
        # argparse passes the raw command-line string to ``type``. With
        # ``type=bool`` every non-empty string (including "False") became
        # True, so the boolean flags could not be turned off by name.
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays falsy: it was the only spelling that
        # produced False under the old ``type=bool`` behaviour.
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(value))

    parser = ArgumentParser()
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--num_worker', type=int, default=0)
    parser.add_argument('--dataset', type=str, default="karate", choices=available_datasets)
    parser.add_argument('--f') ##dummy for jupyternotebook
    args = parser.parse_args(argv)

    dict_args = vars(args)

    return args, dict_args

# Parse the configuration once, at module load; the `__main__` block reads
# `args.dataset` to pick the dataset.
args, dict_args = get_configuration()

## KNN Ranking

In [43]:
class KNNWeight():
    """Tiered edge weights derived from the feature similarity of edge endpoints.

    For every node ``u`` the edges found in row ``u`` of the adjacency are
    ranked by comparing ``data.x[u]`` with the feature rows of the neighbouring
    nodes, best match first.  The first ``ceil(deg * lambda1)`` edges receive
    ``w1``, the next ``ceil(deg * lambda2)`` edges receive ``w2`` and any
    remaining edges receive ``w3``.
    """

    # Graphs with at least this many nodes are processed with a worker pool.
    MULTIPROCESS_MIN_NODES = 10000

    def __init__(self, data, metric='cosine', log = False, lambda1=0.25, lambda2=0.25, w1=1.0, w2=0.5, w3=0.1):
        """Store the configuration and index the edges by source node.

        Parameters
        ----------
        data
            Graph object exposing ``num_nodes``, ``num_edges``, ``edge_index``
            and ``x``.
        metric : {'cosine', 'euclidean'}
            Pairwise function used to rank neighbours.
        log : bool
            When true, print the configuration and show progress bars.
        lambda1, lambda2 : float
            Fractions of a node's edges placed in the first and second tier.
        w1, w2, w3 : float
            Weights given to the first, second and remaining tier.

        Raises
        ------
        ValueError
            If ``metric`` is not one of the supported names.
        """
        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.log = log
        self.lambda1=lambda1
        self.lambda2=lambda2
        self.w1=w1
        self.w2=w2
        self.w3=w3

        self.metric = metric
        self.sign = 1

        if self.log:
            print("Metric: ", metric)

        if metric=='cosine':
            # Larger similarity is better: negate so that an ascending
            # argsort puts the best neighbour first.
            self.sign = -1
            self.sim_func = cosine_similarity
        elif metric=='euclidean':
            # Smaller distance is better: ascending order is already right.
            self.sign = 1
            self.sim_func = euclidean_distances
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError(
                "Unsupported metric {!r}; expected 'cosine' or 'euclidean'".format(metric))

        # The stored value of each adjacency entry is the position of that
        # edge in data.edge_index, so a row slice yields the edge ids.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

    def node_weight(self,u):
        """Rank the edges of node ``u`` and assign the tier weights.

        Returns
        -------
        (list, list)
            ``S_G`` holds the weights and ``S_edge`` the matching edge ids,
            both ordered from best to worst neighbour.  Two empty lists are
            returned when row ``u`` has no entries.
        """
        row, col, edge_index = self.adj[u,:].coo()

        deg = len(col)
        if deg==0:
            return [],[]

        # Tier sizes; clamped so out-of-range fractions can never produce a
        # negative size or more weights than there are edges.
        l1 = min(deg, max(0, math.ceil(deg*self.lambda1)))
        l2 = min(deg-l1, max(0, math.ceil(deg*self.lambda2)))
        l3 = deg-l1-l2

        target_class_sim = self.sim_func(self.data.x[u].view(1,-1), self.data.x[col.tolist()])
        # sign=-1 sorts descending (similarities), sign=1 ascending (distances).
        ind = np.argsort(self.sign*target_class_sim[0])

        S_G = [float(self.w1)]*l1 + [float(self.w2)]*l2 + [float(self.w3)]*l3
        S_edge = edge_index[ind].tolist()

        return S_G, S_edge

    def _assemble_weights(self, edge_weight, edge_index):
        """Scatter the collected per-edge weights into a tensor ordered by edge id."""
        assert len(edge_index)==self.E, "every edge must be weighted exactly once"

        weight = torch.zeros(len(edge_index))
        # An explicit long index keeps the assignment valid for an empty graph.
        positions = torch.tensor(edge_index, dtype=torch.long)
        weight[positions] = torch.tensor(edge_weight, dtype=weight.dtype)

        return weight

    def _partition_nodes(self, num_blocks):
        """Split ``range(self.N)`` into at most ``num_blocks`` contiguous, non-empty lists."""
        if self.N <= 0:
            return []
        chunk = max(1, math.ceil(self.N/max(1, num_blocks)))
        return [list(range(start, min(start+chunk, self.N)))
                for start in range(0, self.N, chunk)]

    def get_knn_weight(self):
        """Compute all edge weights sequentially in the current process."""
        pbar = None
        if self.log:
            pbar = tqdm(total=self.N)
            pbar.set_description(f'Nodes')

        edge_weight=[]
        edge_index=[]

        try:
            for u in range(self.N):
                weight, e_index = self.node_weight(u)
                edge_weight.extend(weight)
                edge_index.extend(e_index)

                if pbar is not None:
                    pbar.update(1)
        finally:
            if pbar is not None:
                pbar.close()

        return self._assemble_weights(edge_weight, edge_index)

    def process_block(self, list_u):
        """Weight every node in ``list_u``.

        Returns the concatenated weights, the concatenated edge ids and the
        number of nodes handled (used by the caller to advance its progress bar).
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.node_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_knn_weight_multiproces(self):
        """Compute all edge weights with a pool of ``NUM_PROCESSORS`` workers."""
        edge_weight=[]
        edge_index=[]

        N = self.N

        # Plain range chunking also works when N is smaller than the number
        # of processors, where reshaping an empty arange used to fail.
        nodes = self._partition_nodes(NUM_PROCESSORS)

        pool_size = NUM_PROCESSORS
        if self.log:
            print("Pool Size: ", pool_size)

        pbar = None
        if self.log:
            pbar = tqdm(total=N)
            pbar.set_description(f'Nodes')

        try:
            # The context manager tears the workers down even if a block fails.
            with Pool(pool_size) as pool:
                for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                    edge_weight.extend(weight)
                    edge_index.extend(e_index)

                    if pbar is not None:
                        pbar.update(num_el)
        finally:
            if pbar is not None:
                pbar.close()

        return self._assemble_weights(edge_weight, edge_index)


    def compute_weights(self):
        """Return the edge-weight tensor, using the worker pool only for large graphs."""
        if self.data.num_nodes<self.MULTIPROCESS_MIN_NODES:
            weight = self.get_knn_weight()
        else:
            weight = self.get_knn_weight_multiproces()

        return weight

In [44]:
# data, dataset = get_data('Cora', log=False, h_score=False)

In [45]:
# knn_weight = KNNWeight(data, 'cosine', log = True)
# knn_weight.node_weight(3)

In [46]:
def fun(dataset_name='Cora', h=0.25, d_values=range(10, 101, 10), keep_threshold=1.0):
    """Print homophily before and after pruning low-weight edges on synthetic graphs.

    For each ``d`` in ``d_values`` the dataset is loaded, passed through
    ``generate_synthetic`` with that ``d`` and ``h``, weighted with a cosine
    ``KNNWeight`` and then pruned of every edge whose weight is below
    ``keep_threshold``.  One space-separated line is printed per ``d``:
    ``d h Nh Eh Einh E up_Nh up_Eh up_Einh up_E``.

    Parameters
    ----------
    dataset_name : str
        Name handed to ``get_data``.
    h : float
        Forwarded to ``generate_synthetic`` as ``h``.
    d_values : iterable
        Values forwarded to ``generate_synthetic`` as ``d``, one run each.
    keep_threshold : float
        Edges with a weight strictly below this value are removed.

    The defaults reproduce the previously hard-coded experiment.
    """
    methods = ('node', 'edge', 'edge_insensitive')

    for d in d_values:
        # The dataset is reloaded for every setting, as in the original loop.
        data, _ = get_data(dataset_name, log=False)
        data = generate_synthetic(data, d=d, h=h, train=0.1, random_state=None, log=False)

        Nh, Eh, Einh = (homophily(data.edge_index, data.y, method=m) for m in methods)
        E = len(data.edge_index[0])

        knn_weight = KNNWeight(data, 'cosine')
        data.weight = knn_weight.compute_weights()

        # `weight` was assigned just above, so the pruned statistics are
        # always computed; they used to sit behind a guard that would have
        # left the names unbound for the print below.
        cp_data = copy.deepcopy(data)
        G = to_networkx(cp_data, to_undirected=True, edge_attrs=['weight'])
        to_remove = [(a, b) for a, b, attrs in G.edges(data=True) if attrs["weight"] < keep_threshold]
        G.remove_edges_from(to_remove)
        updated_data = from_networkx(G)

        up_E = len(updated_data.edge_index[0])
        up_Nh, up_Eh, up_Einh = (
            homophily(updated_data.edge_index, cp_data.y, method=m) for m in methods)

        print('{} {} {} {} {} {} {} {} {} {}'.format(d, h, Nh, Eh, Einh, E, up_Nh, up_Eh, up_Einh, up_E))

    return
#fun()

# Main

In [47]:
if __name__ == '__main__':

    # The dataset comes from the parsed --dataset option (default "karate").
    # It used to be overwritten with 'karate' here, which silently ignored
    # whatever was given on the command line.
    data, dataset = get_data(args.dataset)
#     data = generate_synthetic(data, d=100, h=0.25, train=0.1, random_state=None, log=False)

    print(data)

#     print("Node Homophily:", homophily(data.edge_index, data.y, method='node'))
#     print("Edge Homophily:", homophily(data.edge_index, data.y, method='edge'))
#     print("Edge_insensitive Homophily:", homophily(data.edge_index, data.y, method='edge_insensitive'))

    knn_weight = KNNWeight(data, 'cosine', log = True)
    #S_G, S_edge = knn_weight.process_block([0,1,2]); print(S_G); print(S_edge);

    # Time only the weight computation, not the data loading.
    start = time.time()
    data.weight = knn_weight.compute_weights()
    end = time.time()
    print("Execution time: ", end-start)

    # Optional follow-up kept for reference: prune edges with weight < 1.0
    # and report homophily on the pruned graph.
#     if 'weight' in data:
#         cp_data= copy.deepcopy(data)
#         G = to_networkx(cp_data, to_undirected=True, edge_attrs=['weight'])
#         to_remove = [(a,b) for a, b, attrs in G.edges(data=True) if attrs["weight"] <1.0 ]
#         G.remove_edges_from(to_remove)
#         updated_data = from_networkx(G)

#         updated_data = from_networkx(G, group_edge_attrs=['weight'])
#         updated_data.weight = updated_data.edge_attr.view(-1)

#         row, col = updated_data.edge_index
#         updated_data.edge_index = torch.stack((torch.cat((row, col),dim=0), torch.cat((col, row),dim=0)),dim=0)
#         updated_data.weight = torch.cat((updated_data.weight, updated_data.weight),dim=0)


#         print(updated_data)

#         print("Node Homophily:", homophily(updated_data.edge_index, cp_data.y, method='node'))
#         print("Edge Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge'))
#         print("Edge_insensitive Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge_insensitive'))

Data directory:  ./Dataset/
Result directory: ./Dataset/RESULTS/

Dataset: KarateClub():
Number of graphs: 1
Number of features: 34
Number of classes: 4

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
Number of nodes: 34
Number of edges: 156
Average node degree: 4.59
Number of training nodes: 4
Training node label rate: 0.12
Has isolated nodes: False
Has self-loops: False
Is undirected: True
Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
Metric:  cosine


Nodes: 100%|██████████| 34/34 [00:00<00:00, 570.03it/s]

Execution time:  0.0634160041809082



