# Test Dataset to check

In [1]:
import multiprocessing

# CPU count reported by multiprocessing; the multiprocessing weighting path below
# uses it both as the worker-pool size and as the number of node blocks.
NUM_PROCESSORS=multiprocessing.cpu_count()
#print("Cpu count: ",NUM_PROCESSORS)

In [2]:
# As it turns out, interactive shells (like Jupyter) cannot handle CPU multiprocessing well, so check which medium the code is running in.
# We write the code in Jupyter for understanding purposes, but the final execution will be in a shell.
from ipynb.fs.full.Utils import isnotebook
from ipynb.fs.full.Dataset import get_data, generate_synthetic
from torch_geometric.utils import to_networkx, from_networkx
import torch_geometric.utils.homophily as homophily
import copy

In [3]:
import torch
import torch.nn as nn
from torch_sparse import SparseTensor
from tqdm import tqdm
import math
import time

# Seed Python's `random` module and NumPy's global RNG with the same fixed value
# so that runs of this notebook are repeatable.
import random
random.seed(12345)
import numpy as np
np.random.seed(12345)

In [4]:
import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from joblib import Parallel, delayed
from multiprocessing.pool import ThreadPool, Pool

In [5]:
import numpy as np
from apricot import FeatureBasedSelection, MaxCoverageSelection, FacilityLocationSelection
from apricot import GraphCutSelection, SumRedundancySelection, SaturatedCoverageSelection, MixtureSelection
from apricot import BaseSelection
import numpy

In [6]:
from CustomGraphBased import CustomGraphBased
from CustomFeatureBased import FeatureBasedSelection
from CustomMaxCoverage import MaxCoverageSelection
from CustomFacilityLocation import FacilityLocationSelection

In [7]:
# X = numpy.exp(numpy.random.randn(1000, 100))
# model = FacilityLocationSelection(999, optimizer='approximate-lazy')
# start = time.time()
# model.fit(X)
# print("Time:", time.time()-start)

# Apricot Facility function implementation

In [8]:
class SubModularWeightApricot():
    """Weight every edge of a graph by ranking each node's neighbours with a
    submodular selection model.

    For each node ``u`` the feature rows of ``u`` and of the neighbours stored
    in row ``u`` of the adjacency are handed to a selector whose initial subset
    is ``u`` itself (position 0).  The selector is asked for ``l1 + l2``
    neighbours, where ``l1 = ceil(deg * lambda1)`` and
    ``l2 = min(deg - l1, ceil(deg * lambda2))``.  Edges towards the first
    ``l1`` picks get ``w1``, the next ``l2`` picks get ``w2`` and every other
    edge of ``u`` gets ``w3``.

    Parameters
    ----------
    data : object exposing ``num_nodes``, ``num_edges``, ``edge_index`` and ``x``
        (presumably a torch_geometric ``Data`` object -- TODO confirm).
    sub_func : {'facility', 'graph', 'feature', 'coverage'}
        Which selector to use; anything else raises ``NotImplementedError``
        when a node is processed.
    log : bool
        Print the metric / pool size and show a progress bar.
    metric : str
        Passed to the 'facility' and 'graph' selectors.
    concave_func : str
        Passed to the 'feature' selector.
    lambda1, lambda2 : float
        Fractions of a node's neighbours that form the first and second tier.
    w1, w2, w3 : float
        Weights of the first tier, second tier and remaining edges.
    """

    def __init__(self, data, sub_func='facility', log=True, metric='cosine', concave_func='sqrt', lambda1=0.25, lambda2=0.25, w1=1.0, w2=0.5, w3=0.1):

        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.log = log

        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.w1 = w1
        self.w2 = w2
        self.w3 = w3

        self.metric = metric
        self.sub_func = sub_func
        self.concave_func = concave_func

        if self.log:
            print(self.metric)

        # The stored value of each adjacency entry is the position of that edge
        # in data.edge_index, so slicing a row yields the ids of its edges.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

    def _build_selector(self, n_select):
        """Create the selector named by ``self.sub_func`` for ``n_select`` picks.

        The selector classes are looked up in the module namespace at call
        time.  Raises ``NotImplementedError`` for an unknown ``sub_func``.
        """
        if self.sub_func == 'facility':
            return FacilityLocationSelection(n_select, initial_subset=[0], metric=self.metric, optimizer='approximate-lazy')
        if self.sub_func == 'graph':
            return CustomGraphBased(n_select, initial_subset=[0], metric=self.metric, optimizer='approximate-lazy')
        if self.sub_func == 'feature':
            return FeatureBasedSelection(n_select, initial_subset=[0], concave_func=self.concave_func, optimizer='approximate-lazy')
        if self.sub_func == 'coverage':
            return MaxCoverageSelection(n_select, initial_subset=[0], optimizer='approximate-lazy')
        # `raise NotImplemented` (the constant) would itself fail with a TypeError.
        raise NotImplementedError(
            f"Unknown sub_func {self.sub_func!r}; expected 'facility', 'graph', 'feature' or 'coverage'")

    def _scatter_weights(self, edge_index, edge_weight):
        """Turn parallel (edge id, weight) lists into a dense weight tensor.

        Asserts that exactly ``self.E`` edges were collected and returns a
        float tensor in which position ``e`` holds the weight of edge ``e``.
        """
        assert len(edge_index) == self.E, \
            f"collected {len(edge_index)} edge weights but the graph has {self.E} edges"

        weight = torch.zeros(len(edge_index))
        if edge_index:
            positions = torch.tensor(edge_index, dtype=torch.long)
            weight[positions] = torch.tensor(edge_weight, dtype=weight.dtype)
        return weight

    def lazy_greedy_weight(self, u):
        """Compute the weights of all edges stored in adjacency row ``u``.

        Returns
        -------
        (S_G, S_edge) : two parallel lists holding, for every edge of ``u``,
            its weight and its edge id.  Both are empty when ``u`` has no
            neighbours.
        """
        _, col, edge_index = self.adj[u, :].coo()

        if len(col) == 0:
            return [], []

        # Position 0 is u itself; positions 1..deg are its neighbours.
        vertices = [u] + col.tolist()

        # Tier sizes: first ceil(deg*lambda1) picks, then up to ceil(deg*lambda2) more.
        l1 = math.ceil(len(col) * self.lambda1)
        l2 = min(len(col) - l1, math.ceil(len(col) * self.lambda2))

        model = self._build_selector(l1 + l2)
        # detach()/cpu() keep the conversion working for non-CPU tensors or
        # tensors that require grad.
        model.fit(self.data.x[vertices].detach().cpu().numpy())

        ranks = [int(r) for r in model.ranking]

        # Map a position in `vertices` straight to its edge id.  Keying on the
        # position (instead of the neighbour id) keeps parallel edges distinct,
        # and position 0 (u itself) is deliberately absent.
        pos2edge = dict(enumerate(edge_index.tolist(), start=1))

        S_G = []
        S_edge = []

        for rank, pos in enumerate(ranks, start=1):
            if rank <= l1:
                S_G.append(self.w1)
            elif rank <= l1 + l2:
                S_G.append(self.w2)
            else:
                S_G.append(self.w3)
            S_edge.append(pos2edge[pos])

        # Neighbours the selector did not pick fall into the lowest tier.
        unranked = sorted(set(range(1, len(vertices))) - set(ranks))
        for pos in unranked:
            S_G.append(self.w3)
            S_edge.append(pos2edge[pos])

        return S_G, S_edge

    def get_submodular_weight(self):
        """Serial path: process every node in this process and return the
        dense edge-weight tensor (position ``e`` = weight of edge ``e``)."""
        edge_weight = []
        edge_index = []

        # `disable` hides the bar when logging is off; the context manager
        # closes it even if a node fails.
        with tqdm(total=self.N, desc='Nodes', disable=not self.log) as pbar:
            for u in range(self.N):
                weight, e_index = self.lazy_greedy_weight(u)
                edge_weight.extend(weight)
                edge_index.extend(e_index)
                pbar.update(1)

        return self._scatter_weights(edge_index, edge_weight)

    def process_block(self, list_u):
        """Process a list of node ids.

        Returns ``(edge_weight, edge_index, len(list_u))``: the concatenated
        weights and edge ids of all listed nodes plus the number of nodes
        handled (used to advance the progress bar).
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_submodular_weight_multiproces(self):
        """Multiprocessing path: split the nodes into ``NUM_PROCESSORS``
        balanced blocks, process them in a worker pool and return the dense
        edge-weight tensor."""
        edge_weight = []
        edge_index = []

        N = self.N
        num_blocks = NUM_PROCESSORS

        # array_split balances the blocks even when N is not a multiple of
        # num_blocks (or smaller than it); empty blocks are dropped.
        nodes = [block.tolist()
                 for block in np.array_split(np.arange(N), num_blocks)
                 if len(block)]

        pool_size = NUM_PROCESSORS

        if self.log:
            print("Pool Size: ", pool_size)

        # The context managers release the worker processes and the progress
        # bar even if a block raises.
        with Pool(pool_size) as pool, \
                tqdm(total=N, desc='Nodes', disable=not self.log) as pbar:
            for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                edge_weight.extend(weight)
                edge_index.extend(e_index)
                pbar.update(num_el)

        return self._scatter_weights(edge_index, edge_weight)

    def compute_weights(self):
        """Return the edge-weight tensor; currently always computed through the
        multiprocessing path."""
        return self.get_submodular_weight_multiproces()
    
# data, dataset = get_data('karate', log=False,h_score=False)
# submodular_weight = SubModularWeightApricot(data, log=True, sub_func='facility', metric='euclidean')

# start = time.time()
# #submodular_weight.get_submodular_weight()
# # submodular_weight.lazy_greedy_weight(0)
# submodular_weight.compute_weights()

# # for i in range(10):
# #      submodular_weight.lazy_greedy_weight(i)
# end = time.time()
# print("Execution time: ", end-start)

In [22]:
# Time the serial (single-process) weighting path on the 'Cora' dataset with the
# class defaults (sub_func='facility', metric='cosine').
# NOTE(review): the output recorded for this cell ends in a KeyboardInterrupt at
# about 19% of the nodes after roughly 18 minutes, so no "Execution time" line
# was produced by that run.
data, dataset = get_data('Cora', log=False,h_score=True)
submodular_weight = SubModularWeightApricot(data)
start = time.time()
# submodular_weight.process_block(list(range(30)))
# submodular_weight.compute_weights()
submodular_weight.get_submodular_weight()    
end = time.time()
print("Execution time: ", end-start)

N  2708  E  10556  d  3.8980797636632203 0.825157880783081 0.8099659085273743 0.7657181620597839 -0.06587088108062744 cosine


Nodes:  19%|█▉        | 516/2708 [18:18<1:10:45,  1.94s/it]Exception ignored on calling ctypes callback function: <function ExecutionEngine._raw_object_cache_getbuffer at 0x7f61c7270790>
Traceback (most recent call last):
  File "/home/njbrunsi/anaconda3/envs/py38cu11/lib/python3.8/site-packages/llvmlite/binding/executionengine.py", line 196, in _raw_object_cache_getbuffer
    module = self._find_module_ptr(module_ptr)
  File "/home/njbrunsi/anaconda3/envs/py38cu11/lib/python3.8/site-packages/llvmlite/binding/executionengine.py", line 144, in _find_module_ptr
    if cast(module._ptr, c_void_p).value == ptr:
KeyboardInterrupt: 
Nodes:  19%|█▉        | 522/2708 [18:30<1:19:22,  2.18s/it]

# Main

In [10]:
# Entry point: time compute_weights() (the multiprocessing path) on the 'karate'
# dataset with the facility-location selector and the euclidean metric.
# The __main__ guard keeps this from running when the file is imported as a
# module rather than executed directly.
if __name__ == '__main__':  
    
    data, dataset = get_data('karate', log=False,h_score=False)
    submodular_weight = SubModularWeightApricot(data, log=True, sub_func='facility', metric='euclidean')

    start = time.time()
    #submodular_weight.get_submodular_weight()
    # submodular_weight.lazy_greedy_weight(0)
    # The returned weight tensor is not kept; only the elapsed time is reported.
    submodular_weight.compute_weights()

    # for i in range(10):
    #      submodular_weight.lazy_greedy_weight(i)
    end = time.time()
    print("Execution time: ", end-start)
    
    # Bare expression with no effect (presumably left to keep the cell output clean).
    None

euclidean
Pool Size:  40


Nodes: 100%|██████████| 34/34 [00:07<00:00,  4.46it/s]


Execution time:  8.476069927215576


In [11]:
# X = numpy.exp(numpy.random.randn(4000, 602))

# model = CustomSubmodularSelector(3999, initial_subset=[0], metric='cosine', optimizer='approximate-lazy')
# start = time.time()
# model.fit(X)
# print("Time:", time.time()-start)

# model.ranking

In [12]:
# import numpy
# from apricot import FacilityLocationSelection

# X = numpy.random.normal(20, 1, size=(10000, 1000))
# # X_reordered = FacilityLocationSelection(100, initial_subset=[0]).fit_transform(X)

# start = time.time()
# model = FacilityLocationSelection(1000).fit(X)
# X_reordered2 = X[model.ranking]
# end = time.time()

# print(end-start)

In [13]:
# import time
# import numpy as np
# from apricot import FacilityLocationSelection
# from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# #X = numpy.random.normal(2, 1, size=(10000, 1000))

# X =np.array([[0,0],[0.8,0.0],[0.2,0.6]])

# sim = euclidean_distances(X)

# print(sim)
# sim = np.max(sim)-sim
# print(sim)

# #print(np.sum(sim,axis=0))


# # X_reordered = FacilityLocationSelection(100, initial_subset=[0]).fit_transform(X)

# start = time.time()
# model = FacilityLocationSelection(3, metric='euclidean',optimizer="naive").fit(X)

# print(model.ranking)
# print(model.gains)

# #X_reordered2 = X[model.ranking]
# end = time.time()

# print(end-start)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.sparse import csr_matrix
import numpy
from sklearn.metrics import pairwise_distances

def calculate_pairwise_distances(X, Y=None, metric='precomputed', n_neighbors=None):
    """Convert pairwise distances into a similarity matrix.

    Parameters
    ----------
    X : array-like
        Feature rows, or an already computed matrix when ``metric`` is
        'precomputed' or 'ignore' (then ``X`` is returned unchanged).
    Y : array-like or None
        Optional second set of rows, forwarded to ``pairwise_distances`` in
        the dense cosine/correlation computations and the no-``n_neighbors``
        branch.
    metric : str
        'euclidean' uses squared distances; 'cosine' / 'correlation' produce
        the squared similarity ``(1 - d) ** 2``; any other metric is passed to
        ``pairwise_distances`` as is.
    n_neighbors : None, int or KNeighborsTransformer
        ``None`` computes the dense matrix.  An ``int`` builds a
        ``KNeighborsTransformer`` with that many neighbours; a transformer
        instance is used directly.

    Returns
    -------
    The similarity matrix: ``1 - distance`` for cosine/correlation in the
    dense branch, otherwise ``max(distance) - distance`` (applied to the
    stored entries only when the result is a ``csr_matrix``).

    Raises
    ------
    TypeError
        If ``n_neighbors`` is neither ``None``, an ``int`` nor a
        ``KNeighborsTransformer``.
    """
    if metric in ('precomputed', 'ignore'):
        return X

    if n_neighbors is None:
        if metric == 'euclidean':
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric, squared=True)
        elif metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X_pairwise = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2

            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
            X_pairwise = numpy.square(X_pairwise, out=X_pairwise)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
        else:
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
    else:
        # Imported here because only this branch needs it; without the import
        # every call with n_neighbors set fails with a NameError.
        from sklearn.neighbors import KNeighborsTransformer

        # Validate up front so an unsupported value fails with a clear error
        # instead of leaving X_pairwise unbound further down.
        if not isinstance(n_neighbors, (int, KNeighborsTransformer)):
            raise TypeError(
                "n_neighbors must be None, an int or a KNeighborsTransformer, "
                f"got {type(n_neighbors).__name__}")

        if metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2

            X = pairwise_distances(X, Y=Y, metric=metric)
            X = numpy.subtract(1, X, out=X)
            X = numpy.square(X, out=X)
            X = numpy.subtract(1, X, out=X)
            # NOTE(review): from here on the metric is 'precomputed', so the
            # final conversion below takes the `max - d` branch, not `1 - d`.
            metric = 'precomputed'

        if isinstance(n_neighbors, int):
            X_pairwise = KNeighborsTransformer(
                n_neighbors=n_neighbors, metric=metric
                ).fit_transform(X)
        else:
            X_pairwise = n_neighbors.fit_transform(X)

    # Flip distances into similarities (in place).
    if metric == 'correlation' or metric == 'cosine':
        if isinstance(X_pairwise, csr_matrix):
            X_pairwise.data = numpy.subtract(1, X_pairwise.data,
                out=X_pairwise.data)
        else:
            X_pairwise = numpy.subtract(1, X_pairwise,
                out=X_pairwise)
    else:
        if isinstance(X_pairwise, csr_matrix):
            X_pairwise.data = numpy.subtract(X_pairwise.max(),
                X_pairwise.data, out=X_pairwise.data)
        else:
            X_pairwise = numpy.subtract(X_pairwise.max(), X_pairwise,
                out=X_pairwise)

    return X_pairwise

In [15]:
import time
import numpy as np
from apricot import FacilityLocationSelection
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


# Seven toy points on the x-axis (y is always 0).
X = np.array([[x_coord, 0] for x_coord in (0, 1, 2, 5, 7, 8, 9)])

# Rank all seven points with the naive optimizer and time the fit plus the prints.
start = time.time()
model = FacilityLocationSelection(7, metric='euclidean', optimizer="naive").fit(X)
# Variant with point 0 pre-selected:
# model = FacilityLocationSelection(4, initial_subset=[0], metric='euclidean',optimizer="naive").fit(X)

print(model.ranking)
print(model.gains)
end = time.time()

print(end - start)

[3 1 5 0 2 4 6]
[488.  48.  27.   1.   1.   1.   1.]
2.390138626098633


In [16]:
# Similarity matrix of the toy points for the euclidean metric.
X_pair = calculate_pairwise_distances(X, metric='euclidean')

# Print the matrix as LaTeX table rows: integer cells joined by " & ",
# each row terminated by "\\".
n_points = len(X)
for row_idx in range(n_points):
    cells = [str(int(X_pair[row_idx, col_idx])) for col_idx in range(n_points)]
    print(" & ".join(cells) + "\\\\")

81 & 80 & 77 & 56 & 32 & 17 & 0\\
80 & 81 & 80 & 65 & 45 & 32 & 17\\
77 & 80 & 81 & 72 & 56 & 45 & 32\\
56 & 65 & 72 & 81 & 77 & 72 & 65\\
32 & 45 & 56 & 77 & 81 & 80 & 77\\
17 & 32 & 45 & 72 & 80 & 81 & 80\\
0 & 17 & 32 & 65 & 77 & 80 & 81\\


In [17]:
# Recompute the same quantity by hand: square the plain Euclidean distances and
# subtract them from their maximum.
X_pairwise = pairwise_distances(X, metric='euclidean', squared=False)

squared_distances = np.power(X_pairwise, 2)
squared_distances.max() - squared_distances

array([[81., 80., 77., 56., 32., 17.,  0.],
       [80., 81., 80., 65., 45., 32., 17.],
       [77., 80., 81., 72., 56., 45., 32.],
       [56., 65., 72., 81., 77., 72., 65.],
       [32., 45., 56., 77., 81., 80., 77.],
       [17., 32., 45., 72., 80., 81., 80.],
       [ 0., 17., 32., 65., 77., 80., 81.]])

In [18]:
# Difference between the summed column-wise maxima of rows {3, 5} and of row {3}
# alone, i.e. how much adding row 5 raises that sum.
coverage_rows_3_5 = sum(np.max(X_pair[[3, 5]], axis=0))
coverage_row_3 = sum(np.max(X_pair[[3]], axis=0))
coverage_rows_3_5 - coverage_row_3

27.0

In [19]:
# Print the two coordinates of every point in X as LaTeX table rows
# ("a & b\\").
for row_idx in range(len(X)):
    print(" & ".join(str(int(X[row_idx, col_idx])) for col_idx in range(2)) + "\\\\")

0 & 0\\
1 & 0\\
2 & 0\\
5 & 0\\
7 & 0\\
8 & 0\\
9 & 0\\


In [20]:
# sum(np.max(sim[:2],axis=0))-sum(np.max(sim[:1],axis=0))

In [21]:
# sim = np.array(
#     [[0.72, 0.08, 0.32],
#      [0.08, 0.72, 0.  ],
#      [0.32, 0.,   0.72]])

# sum(np.max(sim,axis=0))-sum(np.max(sim[:2],axis=0))