# 0 - Installation & Imports

In [2]:
%%time
!pip install -q networkx
!pip install -q ogb
!pip install -q torch_geometric

CPU times: user 60.5 ms, sys: 13.7 ms, total: 74.2 ms
Wall time: 9.65 s


In [3]:
from os import mkdir,path
import math
import numpy
from pandas import DataFrame
from tqdm.notebook import tqdm
from IPython.display import clear_output


from torch import serialization
from torch_geometric.data.storage import GlobalStorage
from torch_geometric.data.data import DataEdgeAttr, DataTensorAttr
serialization.add_safe_globals([GlobalStorage, DataEdgeAttr, DataTensorAttr])

import networkx as nx

from ogb.linkproppred import PygLinkPropPredDataset

from torch_geometric.utils import to_networkx ,to_networkit,to_trimesh,get_embeddings

# 1 - Data Loading and Preparation

In [4]:
dataset = PygLinkPropPredDataset(name='ogbl-ddi')
clear_output()
graph_data = dataset[0]

In [5]:
Graph = to_networkx(graph_data).to_undirected()
Graph.number_of_edges()

1067911

In [6]:
features_save_path = "Features - NetworkX"
if not path.exists(features_save_path):
    mkdir(features_save_path)

# 2 - Computing Link Prediction Features

## 2.1 - Resource Allocation index

In [7]:
%%time
list_of_edge_dicts = []
RA_all_edges = nx.resource_allocation_index(Graph ,Graph.edges())
for n_u,n_v,value in tqdm(RA_all_edges, total=Graph.number_of_edges()):
    list_of_edge_dicts.append( {'node_u':int(n_u),'node_v':int(n_v),'nx.resource_allocation_index':value} )
df = DataFrame(list_of_edge_dicts)
df.to_csv(path.join(features_save_path,'Resource Allocation index.csv'),index=False)
df.sample(4)

  0%|          | 0/1067911 [00:00<?, ?it/s]

CPU times: user 7min 53s, sys: 3.5 s, total: 7min 56s
Wall time: 8min 16s


Unnamed: 0,node_u,node_v,nx.resource_allocation_index
892045,2239,3672,1.011605
430165,691,972,0.753832
88753,184,1616,0.56141
960766,2526,2630,0.55322


## 2.2 - Adamic Adar index

In [8]:
%%time
list_of_edge_dicts = []
AA_all_edges = nx.adamic_adar_index(Graph ,Graph.edges())
for n_u,n_v,value in tqdm(AA_all_edges, total=Graph.number_of_edges()):
    list_of_edge_dicts.append( {'node_u':int(n_u),'node_v':int(n_v),'nx.adamic_adar_index':value} )
df = DataFrame(list_of_edge_dicts)
df.to_csv( path.join(features_save_path,'Adamic Adar index.csv') ,index=False)
df.sample(4)

  0%|          | 0/1067911 [00:00<?, ?it/s]

CPU times: user 8min 54s, sys: 3.77 s, total: 8min 58s
Wall time: 9min 16s


Unnamed: 0,node_u,node_v,nx.adamic_adar_index
122103,225,808,103.381337
461148,741,2888,28.194509
93094,189,3620,100.056564
604271,1035,3481,84.882718


## 2.3 - Jaccard Coefficient

In [9]:
%%time
list_of_edge_dicts = []
Jaccard_all_edges = nx.jaccard_coefficient(Graph ,Graph.edges())
for n_u,n_v,value in tqdm(Jaccard_all_edges, total=Graph.number_of_edges()):
    list_of_edge_dicts.append( {'node_u':int(n_u),'node_v':int(n_v),'nx.jaccard_coefficient':value} )
df = DataFrame(list_of_edge_dicts)
df.to_csv( path.join(features_save_path,'Jaccard Coefficient.csv') ,index=False)
df.sample(4)

  0%|          | 0/1067911 [00:00<?, ?it/s]

CPU times: user 5min 28s, sys: 2.39 s, total: 5min 30s
Wall time: 5min 38s


Unnamed: 0,node_u,node_v,nx.jaccard_coefficient
563179,941,971,0.400489
790945,1684,1940,0.233202
812913,1817,2993,0.803625
31417,77,1834,0.396641


## 2.4 Preferential Attachment

In [10]:
list_of_edge_dicts = []
PE_all_edges = nx.preferential_attachment(Graph ,Graph.edges())
for n_u,n_v,value in tqdm(PE_all_edges, total=Graph.number_of_edges()):
    list_of_edge_dicts.append( {'node_u':int(n_u),'node_v':int(n_v),'nx.preferential_attachment':value} )
df = DataFrame(list_of_edge_dicts)
df.to_csv(path.join(features_save_path,'Preferential Attachment.csv'),index=False)
df.sample(4)

  0%|          | 0/1067911 [00:00<?, ?it/s]

Unnamed: 0,node_u,node_v,nx.preferential_attachment
570246,958,3737,719280
930674,2395,2616,883796
592444,1008,2226,681378
909952,2309,4248,255712


## 2.5 - Edge Betweenness Centrality

In [11]:
%%time
edge_betweenness_C = nx.edge_betweenness_centrality(Graph, k=100)
edge_betweenness_C = DataFrame( {'Edge':edge_betweenness_C.keys(),
                                 'Edge Betweenness Centrality':edge_betweenness_C.values()} )
edge_betweenness_C.sample(4)

CPU times: user 1min 43s, sys: 424 ms, total: 1min 43s
Wall time: 1min 44s


Unnamed: 0,Edge,Edge Betweenness Centrality
781047,"(1643, 3888)",2.047369e-08
897497,"(2264, 3036)",1.331917e-07
295211,"(475, 1242)",1.227562e-08
813665,"(1823, 3440)",2.130092e-08


# 3 - Computing Centrality of Nodes

## 3.1 - Betweenness Centrality

In [12]:
%%time
betweenness_C = nx.betweenness_centrality(Graph,k=200)
betweenness_C = DataFrame( {'Node':betweenness_C.keys() , 'Betweenness Centrality':betweenness_C.values()} )
betweenness_C.to_csv(path.join(features_save_path,'Nodes Betweenness Centrality.csv'),index=False)
betweenness_C.sample(4)

CPU times: user 2min 6s, sys: 341 ms, total: 2min 6s
Wall time: 2min 7s


Unnamed: 0,Node,Betweenness Centrality
2219,2219,3.034315e-05
2988,2988,4.655906e-06
1550,1550,2.108634e-08
497,497,6.067843e-06


## 3.2 - Degree Centrality

In [13]:
%%time
degree_C = nx.degree_centrality(Graph)
degree_C = DataFrame( {'Node':degree_C.keys() , 'Degree Centrality':degree_C.values()} )
degree_C.to_csv(path.join(features_save_path,'Nodes Degree Centrality.csv'),index=False)
degree_C.sample(4)

CPU times: user 21 ms, sys: 2 ms, total: 23.1 ms
Wall time: 22.6 ms


Unnamed: 0,Node,Degree Centrality
3049,3049,0.1594
880,880,0.138772
904,904,0.176278
1088,1088,0.125176


## 3.3 - Eigenvector Centrality

In [14]:
%%time
eigenvector_C = nx.eigenvector_centrality(Graph, max_iter=80)
eigenvector_C = DataFrame( {'Node':eigenvector_C.keys() , 'Eigenvector Centrality':eigenvector_C.values()} )
eigenvector_C.to_csv(path.join(features_save_path,'Nodes Eigenvector Centrality.csv'),index=False)
eigenvector_C.sample(4)

CPU times: user 6.62 s, sys: 23 ms, total: 6.65 s
Wall time: 6.72 s


Unnamed: 0,Node,Eigenvector Centrality
3711,3711,0.000327
147,147,0.012675
3954,3954,0.006876
69,69,6e-05


## 3.4 - Information Centrality

In [15]:
%%time
information_C = nx.information_centrality(Graph)
information_C = DataFrame( {'Node':information_C.keys() , 'Information Centrality':information_C.values()} )
information_C.to_csv(path.join(features_save_path,'Nodes Information Centrality.csv'),index=False)
information_C.sample(4)

CPU times: user 1min 41s, sys: 972 ms, total: 1min 42s
Wall time: 1min 43s


Unnamed: 0,Node,Information Centrality
243,243,0.004638
4070,4070,0.004631
534,534,0.004471
20,20,0.004541


## 3.5 - Current Flow Closeness Centrality

In [16]:
%%time
current_flow_closeness_C = nx.current_flow_closeness_centrality(Graph)
current_flow_closeness_C = DataFrame({'Node':current_flow_closeness_C.keys(),
                                      'Current Flow Closeness Centrality':current_flow_closeness_C.values()} )
current_flow_closeness_C.to_csv(path.join(features_save_path,'Nodes Current Flow Closeness Centrality.csv'),
                                index=False)
current_flow_closeness_C.sample(4)

CPU times: user 1min 40s, sys: 421 ms, total: 1min 41s
Wall time: 1min 41s


Unnamed: 0,Node,Current Flow Closeness Centrality
1980,1980,0.004291
3411,3411,0.00445
3250,3250,0.004741
1113,1113,0.004669


## 3.6 - Approximate Current Flow Betweenness Centrality

In [17]:
%%time
apprx_crrnt_flow_brwnns_C = nx.approximate_current_flow_betweenness_centrality(Graph)
apprx_crrnt_flow_brwnns_C = DataFrame({'Node':apprx_crrnt_flow_brwnns_C.keys(),
                           'Approximate Current Flow Betweenness Centrality':apprx_crrnt_flow_brwnns_C.values()} )
apprx_crrnt_flow_brwnns_C.to_csv(path.join(features_save_path,'Nodes Approximate Current Flow Betweenness Centrality.csv'),
                                index=False)
apprx_crrnt_flow_brwnns_C.sample(4)

CPU times: user 3min 36s, sys: 1.83 s, total: 3min 38s
Wall time: 3min 30s


Unnamed: 0,Node,Approximate Current Flow Betweenness Centrality
1355,1355,0.000463
338,338,0.001493
742,742,0.000542
2849,2849,0.000848


# 4 - Link Analysis for Nodes

## 4.1 - PageRank

In [18]:
%%time
pagerank = nx.pagerank(Graph, max_iter=4000)
pagerank = DataFrame( {'Node':pagerank.keys() , 'PageRank':pagerank.values()} )
pagerank.to_csv(path.join(features_save_path,'Nodes PageRank.csv'),index=False)
pagerank.sample(5)

CPU times: user 4.32 s, sys: 112 ms, total: 4.43 s
Wall time: 4.41 s


Unnamed: 0,Node,PageRank
1230,1230,0.000341
3509,3509,0.000295
4140,4140,0.000195
3569,3569,5.7e-05
3874,3874,0.000263


In [19]:
%%time
Authorities,Hubs = nx.hits(Graph, max_iter=4000)
hits = DataFrame( {'Node':Authorities.keys() , 'Authorities':Authorities.values() , 'Hubs':Hubs.values()} )
hits.to_csv(path.join(features_save_path,'Nodes Authorities & Hubs.csv'),index=False)
hits.sample(5)

CPU times: user 4.88 s, sys: 114 ms, total: 5 s
Wall time: 5.6 s


Unnamed: 0,Node,Authorities,Hubs
296,296,4.586015e-06,4.586015e-06
661,661,1.023931e-05,1.023931e-05
3051,3051,0.0002502904,0.0002502904
1928,1928,0.0001001506,0.0001001506
1535,1535,5.690863e-07,5.690863e-07


# 5 - Saving Features

In [20]:
!zip -r Features\ -\ NetworkX.zip Features\ -\ NetworkX

  adding: Features - NetworkX/ (stored 0%)
  adding: Features - NetworkX/Nodes Eigenvector Centrality.csv (deflated 57%)
  adding: Features - NetworkX/Nodes Current Flow Closeness Centrality.csv (deflated 59%)
  adding: Features - NetworkX/Nodes Information Centrality.csv (deflated 59%)
  adding: Features - NetworkX/Nodes Degree Centrality.csv (deflated 75%)
  adding: Features - NetworkX/Preferential Attachment.csv (deflated 63%)
  adding: Features - NetworkX/Nodes Approximate Current Flow Betweenness Centrality.csv (deflated 57%)
  adding: Features - NetworkX/Nodes PageRank.csv (deflated 58%)
  adding: Features - NetworkX/Jaccard Coefficient.csv (deflated 61%)
  adding: Features - NetworkX/Nodes Betweenness Centrality.csv (deflated 57%)
  adding: Features - NetworkX/Adamic Adar index.csv (deflated 56%)
  adding: Features - NetworkX/Nodes Authorities & Hubs.csv (deflated 69%)
  adding: Features - NetworkX/Resource Allocation index.csv (deflated 58%)


In [21]:
from google.colab import files
files.download('Features - NetworkX.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>