In [3]:
! pip install ogb
! pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
! pip install torch_geometric

Collecting ogb
  Using cached ogb-1.3.6-py3-none-any.whl (78 kB)
Collecting pandas>=0.24.0
  Downloading pandas-2.0.1-cp311-cp311-win_amd64.whl (10.6 MB)
     -------------------------------------- 10.6/10.6 MB 182.5 kB/s eta 0:00:00
Collecting outdated>=0.2.0
  Using cached outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting littleutils
  Using cached littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pytz>=2020.1
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py): started
  Building wheel for littleutils (setup.py): finished with status 'done'
  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7034 sha256=ea4442315604fd9a8cebeeeb6afd9fd3c971e974987393c4bddc94cbc4c322f6
  

In [1]:
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
import torch_geometric.transforms as T
from joblib import dump, load
import torch
import networkx as nx
import scipy
import itertools
import community as community_louvain
import heapq
import operator
import copy
from node2vec import Node2Vec
import random
from gensim.models import Word2Vec
import multiprocessing
import pandas as pd
from sklearn.cluster import KMeans
import torch.nn.functional as F
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import GATConv,SAGEConv,GCNConv
# import cugraph
# import cudf

import plotly.graph_objects as go
import numpy as np
import plotly.express as px
import warnings
import matplotlib.pyplot as plt

plt.style.use('dark_background')
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = PygNodePropPredDataset(name='ogbn-arxiv')
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
data = dataset[0]

In [4]:
node_feat = data.x
edge_index = data.edge_index
y_true = data.y
number_of_classes = 40

In [5]:
print(edge_index.tolist()[0][0])
print(edge_index.tolist()[1][0])

104447
13091


In [6]:
node_feat

tensor([[-0.0579, -0.0525, -0.0726,  ...,  0.1734, -0.1728, -0.1401],
        [-0.1245, -0.0707, -0.3252,  ...,  0.0685, -0.3721, -0.3010],
        [-0.0802, -0.0233, -0.1838,  ...,  0.1099,  0.1176, -0.1399],
        ...,
        [-0.2205, -0.0366, -0.4022,  ...,  0.1134, -0.1614, -0.1452],
        [-0.1382,  0.0409, -0.2518,  ..., -0.0893, -0.0413, -0.3761],
        [-0.0299,  0.2684, -0.1611,  ...,  0.1208,  0.0776, -0.0910]])

In [7]:
directed_graph = nx.DiGraph()

directed_graph.add_nodes_from((index, {'Features':node_feat[index].tolist()}) for index in range(data.num_nodes))

undirected_graph = nx.Graph()

undirected_graph.add_nodes_from((index, {'Features':node_feat[index].tolist()}) for index in range(data.num_nodes))


In [8]:
edges = edge_index.tolist()
for index in range(data.num_edges):
    directed_graph.add_edge(edges[0][index], edges[1][index])
    if index%100000 == 0:
        print(f'We create {index} edges in directed graph.')

edges = edge_index.tolist()
for index in range(data.num_edges):
    undirected_graph.add_edge(edges[0][index], edges[1][index])
    if index%100000 == 0:
        print(f'We create {index} edges in undirected graph.')

We create 0 edges in directed graph.
We create 100000 edges in directed graph.
We create 200000 edges in directed graph.
We create 300000 edges in directed graph.
We create 400000 edges in directed graph.
We create 500000 edges in directed graph.
We create 600000 edges in directed graph.
We create 700000 edges in directed graph.
We create 800000 edges in directed graph.
We create 900000 edges in directed graph.
We create 1000000 edges in directed graph.
We create 1100000 edges in directed graph.
We create 0 edges in undirected graph.
We create 100000 edges in undirected graph.
We create 200000 edges in undirected graph.
We create 300000 edges in undirected graph.
We create 400000 edges in undirected graph.
We create 500000 edges in undirected graph.
We create 600000 edges in undirected graph.
We create 700000 edges in undirected graph.
We create 800000 edges in undirected graph.
We create 900000 edges in undirected graph.
We create 1000000 edges in undirected graph.
We create 1100000 e

In [9]:
print('Number of nodes in graph is ', directed_graph.number_of_nodes())
print('Number of edges in graph is', directed_graph.number_of_edges())

Number of nodes in graph is  169343
Number of edges in graph is 1166243


In [10]:
avg_deg = round(data.num_edges/data.num_nodes, 2)
print('Average output/input degree is', avg_deg)

Average output/input degree is 6.89


In [11]:
input_degrees = directed_graph.in_degree()
output_degrees = directed_graph.out_degree()

In [13]:
len(input_degrees)

169343

In [14]:
num_input_degrees={}
for degree in range(len(input_degrees)):
    num_input_degrees[input_degrees[degree]] = num_input_degrees.get(input_degrees[degree], 0) + 1

input_degrees_counts = list(num_input_degrees.values())
input_degrees_size = list(num_input_degrees.keys())

In [14]:
scatter = go.Scatter(x=np.log2(input_degrees_size), y=np.log2(input_degrees_counts), mode="lines+markers")
fig = go.Figure(scatter)
fig.update_layout(
title="Input Degree Distribution Plot",
xaxis_title="Log2 Input Degree",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

In [15]:
num_output_degrees={}
for degree in range(len(output_degrees)):
    num_output_degrees[output_degrees[degree]] = num_output_degrees.get(output_degrees[degree], 0) + 1

output_degrees_counts = list(num_output_degrees.values())
output_degrees_size = list(num_output_degrees.keys())

In [16]:
scatter = go.Scatter(x=np.log2(output_degrees_size), y=np.log2(output_degrees_counts), mode="lines+markers")
fig = go.Figure(scatter)
fig.update_layout(
title="Output Degree Distribution Plot",
xaxis_title="Log2 Output Degree",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

In [17]:
avg_cluster_coef = round(nx.average_clustering(directed_graph),2)
print('Average clustering coefficient is', avg_cluster_coef)

Average clustering coefficient is 0.12


In [18]:
clusterings = nx.clustering(directed_graph)

In [19]:
clusterings

{0: 0.017300628036497217,
 1: 0.5,
 2: 0.10989010989010989,
 3: 0.5,
 4: 0.4,
 5: 0,
 6: 0,
 7: 0.5,
 8: 0,
 9: 0.3333333333333333,
 10: 0.1,
 11: 0.4,
 12: 0.23809523809523808,
 13: 0.2,
 14: 0.047619047619047616,
 15: 0.2076923076923077,
 16: 0.15,
 17: 0.023809523809523808,
 18: 0,
 19: 0,
 20: 0.13333333333333333,
 21: 0,
 22: 0,
 23: 0.17857142857142858,
 24: 0,
 25: 0,
 26: 0,
 27: 0.3333333333333333,
 28: 0.16666666666666666,
 29: 0.01422475106685633,
 30: 0.06904761904761905,
 31: 0.08157894736842106,
 32: 0,
 33: 0,
 34: 0.08333333333333333,
 35: 0.08333333333333333,
 36: 0,
 37: 0.05128205128205128,
 38: 0,
 39: 0,
 40: 0,
 41: 0.05555555555555555,
 42: 0.06666666666666667,
 43: 0.055384615384615386,
 44: 0.5,
 45: 0.2,
 46: 0,
 47: 0.08333333333333333,
 48: 0,
 49: 0,
 50: 0,
 51: 0,
 52: 0.05555555555555555,
 53: 0,
 54: 0.3333333333333333,
 55: 0.16666666666666666,
 56: 0,
 57: 0,
 58: 0,
 59: 0.16666666666666666,
 60: 0.5,
 61: 0.22727272727272727,
 62: 0.2083333333333333

In [20]:
num_clusterings={}
for cluster_coe in range(len(clusterings)):
    num_clusterings[clusterings[cluster_coe]] = num_clusterings.get(clusterings[cluster_coe], 0) + 1

cluster_coe_counts = list(num_clusterings.values())
cluster_coe_size = list(num_clusterings.keys())

In [21]:
scatter = go.Scatter(x=cluster_coe_size, y=np.log2(cluster_coe_counts), mode="markers")
fig = go.Figure(scatter)
fig.update_layout(
title="Clustering Coefficient Distribution Plot",
xaxis_title="Clustering Coefficient",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

In [43]:
triangles = nx.triangles(undirected_graph)

In [46]:
num_triangles={}
for triangel in triangles.keys():
    num_triangles[triangles[triangel]] = num_triangles.get(triangles[triangel], 0) + 1

triangle_counts = list(num_triangles.values())
triangle_size = list(num_triangles.keys())


In [47]:
scatter = go.Scatter(x=np.log2(triangle_size), y=np.log2(triangle_counts), mode="markers")
fig = go.Figure(scatter)
fig.update_layout(
title="Triangle Distribution Plot",
xaxis_title="Log2 Triangles For One Node",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

## Compute graphlets
For every node we extract five features:
1. output degree (graphlet of size 2)
2. input degree (graphlet of size 2)
3. graphlet of size 3 that 3 node are in a line the first node in line is root
4. graphlet of size 3 that 3 node are in a line the second node in line is root
5. graphlet of size 3 that 3 node are triangle

In [15]:
transform_dataset = PygNodePropPredDataset(name='ogbn-arxiv',transform=T.ToSparseTensor() )

In [26]:
transform_data = transform_dataset[0]
transform_data.adj_t

SparseTensor(row=tensor([     0,      0,      0,  ..., 169341, 169341, 169341]),
             col=tensor([   411,    640,   1162,  ...,  30351,  35711, 103121]),
             size=(169343, 169343), nnz=1166243, density=0.00%)

In [27]:
adj_matrix = transform_data.adj_t.to_symmetric()
adj_matrix = adj_matrix.to_torch_sparse_coo_tensor()

In [28]:
adj_matrix_pow_2 = torch.sparse.mm(adj_matrix, adj_matrix)

In [29]:
adj_matrix_pow_2

tensor(indices=tensor([[     0,      0,      0,  ..., 169342, 169342, 169342],
                       [     0,    112,    244,  ..., 168387, 168630, 169342]]),
       values=tensor([291.,   2.,   1.,  ...,   1.,   1.,   2.]),
       size=(169343, 169343), nnz=589834701, layout=torch.sparse_coo)

Due to the lack of access to enough memory, I could not run this code on the CPU and not on the GPU because it requires 50 GB of free memory.

In [30]:
adj_matrix_pow_3 = torch.sparse.mm(adj_matrix_pow_2, adj_matrix)

RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 49994784256 bytes.

In [None]:
adj_matrix_pow_3

In [20]:
is_strong_connected = nx.is_strongly_connected(directed_graph)
num_strong_connected = nx.number_strongly_connected_components(directed_graph)
largest_strongly_component = max(nx.strongly_connected_components(directed_graph), key=len)
if is_strong_connected:
    print('Is strongly connectec')
else:
    print('Is not strongly connected')
print(f'Number of stongly connected components in dataset is {num_strong_connected}')
print(f'Largest strongly component contains {len(largest_strongly_component)} nodes')

Is not strongly connected
Number of stongly connected components in dataset is 141223
Largest strongly component contains 23164 nodes


In [79]:
num_strongly={}
for strong_comp in nx.strongly_connected_components(directed_graph):
    num_strongly[len(strong_comp)] = num_strongly.get(len(strong_comp), 0)+1 
    
strongly_counts = list(num_strongly.values())
strongly_size = list(num_strongly.keys())

In [81]:
scatter = go.Scatter(x=np.log2(strongly_size), y=np.log2(strongly_counts), mode="markers+lines")
fig = go.Figure(scatter)
fig.update_layout(
title="Strongly Connected Component Size Distribution Plot",
xaxis_title="Log2 Strongly Connected Component Size",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

In [21]:
is_weak_connected = nx.is_weakly_connected(directed_graph)
num_weak_connected = nx.number_weakly_connected_components(directed_graph)
giant_weakly_component = max(nx.weakly_connected_components(directed_graph), key=len)
if is_weak_connected:
    print('Is weakly connectec')
else:
    print('Is not weakly connected')
print(f'Number of weakly connected components in dataset is {num_weak_connected}')
print(f'Largest weakly component contains {len(giant_weakly_component)} nodes')

Is weakly connectec
Number of weakly connected components in dataset is 1
Largest weakly component contains 169343 nodes


In [83]:
num_weakly={}
for weak_comp in nx.weakly_connected_components(directed_graph):
    num_weakly[len(weak_comp)] = num_weakly.get(len(weak_comp), 0)+1 
    
num_weakly_keys = list(num_weakly.keys())
num_weakly_keys.sort()
for key in num_weakly_keys:
    print(f'We have {num_weakly[key]} of size {key} attracting components.')

We have 1 of size 169343 attracting components.


In [23]:
is_attracting = nx.is_attracting_component(directed_graph)
num_attracting = nx.number_attracting_components(directed_graph)
largest_attracting = max(nx.attracting_components(directed_graph), key=len)
if is_attracting:
    print('Is an attracting component')
else:
    print('Is not an attracting component')
print(f'Number of attracting components in dataset is {num_attracting}')
print(f'Largest attracting component contains {len(largest_attracting)} nodes')

Is not an attracting component
Number of attracting components in dataset is 17647
Largest attracting component contains 9 nodes


In [84]:
num_attractings={}
for attract_comp in nx.attracting_components(directed_graph):
    num_attractings[len(attract_comp)] = num_attractings.get(len(attract_comp), 0)+1 
    
attracting_counts = list(num_attractings.values())
attracting_size = list(num_attractings.keys())

In [85]:
scatter = go.Scatter(x=attracting_size, y=np.log2(attracting_counts), mode="markers+lines")
fig = go.Figure(scatter)
fig.update_layout(
title="Attracting Component Size Distribution Plot",
xaxis_title="Attracting Component Size",
yaxis_title="Log2 The Number Of Repetitions",
template='plotly_dark'
)
fig.show()

In [24]:
is_semiconnected = nx.is_semiconnected(directed_graph)


In [9]:
diameter = nx.approximation.diameter(undirected_graph) 

In [34]:
y_true_set = []
for community_number in range(number_of_classes):
    y_true_set.append(set(index for index,value in enumerate(y_true) if value == community_number))

In [29]:
def evaluator(y_true, y_pred):
    y_true = copy.deepcopy(y_true)
    y_pred = copy.deepcopy(y_pred)
    right_predictions = 0
    total_predictions = 0
    for true_community in y_true:
        best_match = {}
        max_common =0
        for pred_community in y_pred:
            common = len(pred_community & true_community)
            if(common > max_common):
                best_match = pred_community
                max_common = common
        if(max_common != 0):
            right_predictions += max_common
            y_pred.remove(best_match)
        total_predictions += len(true_community)

    return right_predictions/total_predictions*100

In [30]:
def common_calculator(comm1, comm2):
    return len(comm2 & comm1)

def rank(List):
    output = []
    list_copy = copy.deepcopy(List)
    list_copy = list(dict.fromkeys(list_copy))
    list_copy.sort(reverse=True)
    for value in list_copy:
        for index in range(len(List)):
            if List[index] == value:
                output.append(index)
    return output
            
def priorities(List1, List2):
    output = []
    for comm1 in List1:
        temp = []
        for comm2 in List2:
            common = common_calculator(comm1,comm2)
            temp.append(common)
        output.append(rank(temp))
    return output

def find_pair(List, value):
    pair_value = -1
    for pair in List:
        if pair[1] == value:
            return pair[0]

def stable_matcher(chooser, wanted):
    pairs = []
    unmatched_chooser = [index for index in range(len(chooser))]
    unmatched_wanted = [index for index in range(len(wanted))]
    prior_chooser = priorities(chooser, wanted)
    prior_wanted = priorities(wanted, chooser)
    while(len(unmatched_chooser)):
        comm1 = unmatched_chooser.pop()
        for matching_pred in prior_chooser[comm1]:
            if matching_pred in unmatched_wanted:
                pairs.append([comm1, matching_pred])
                unmatched_wanted.remove(matching_pred)
                break
            else:
                matched_with_wanted = find_pair(pairs,matching_pred)
                if prior_wanted[matching_pred].index(comm1) < prior_wanted[matching_pred].index(matched_with_wanted):
                    pairs.remove([matched_with_wanted, matching_pred])
                    pairs.append([comm1, matching_pred])
                    unmatched_chooser.append(matched_with_wanted)
                    break
    return pairs

def stable_match_evaluator(y_true, y_pred):
    chooser = y_true
    wanted = y_pred
    pairs = stable_matcher(chooser, wanted)
    common = 0
    total = 0
    for pair in pairs:
        common += common_calculator(chooser[pair[0]], wanted[pair[1]])
        total += len(chooser[pair[0]])
    accuracy_on_y_true_choose = common/total

    chooser = y_pred
    wanted = y_true   
    pairs = stable_matcher(chooser, wanted)
    common = 0
    total = 0
    for pair in pairs:
        common += common_calculator(chooser[pair[0]], wanted[pair[1]])
        total += len(chooser[pair[0]])
    accuracy_on_y_pred_choose = common/total

    return max(accuracy_on_y_pred_choose,accuracy_on_y_true_choose)*100
    


Our network is so large then we should get some sample from it.
I wanna to use Forest fire sampling with landmarks, which selects a set of landmark nodes that are representative of different regions or communities of the graph, and starts a fire from each landmark node. This can ensure that the sample covers all the important parts of the graph and preserves the global structure.
Also we get the same diameter and clustering coef and we have sample over all communities.

In [218]:
def forest_fire_sampling_with_landmarks(G:nx.DiGraph, landmarks, p) -> nx.DiGraph:
    """
    G: input graph
    start_node: node to start the "fire"
    p: probability of "burning" a neighbor
    """
    visited = set()

    for start_node in landmarks:
        stack = [start_node]
        while stack:
            node = stack.pop()
            if node not in visited:
                visited.add(node)
                neighbors = list(G.neighbors(node))
                random.shuffle(neighbors)
                for neighbor in neighbors:
                    if random.random() < p:
                        stack.append(neighbor)

    return G.subgraph(visited)

In [234]:
log_num = []
for community in y_true_set:
    log_num.append(int(np.round(np.log10(len(community))))*2)

In [235]:
print(log_num)

[6, 6, 8, 6, 8, 8, 6, 6, 8, 6, 8, 6, 2, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 8, 6, 8, 8, 8, 6, 8, 6, 6, 6, 8, 4, 8, 6, 6, 6]


In [236]:
landmarks = set()

for index in range(number_of_classes):
    for node in random.sample(population=list(y_true_set[index]), k=log_num[index]):
        landmarks.add(node)

In [237]:
print(landmarks)

{69123, 84996, 109065, 38922, 61963, 104970, 19978, 126991, 92176, 112659, 46100, 122389, 16916, 159764, 28693, 88089, 97813, 104474, 29205, 117790, 75296, 5665, 71205, 15911, 75308, 5676, 51759, 106032, 12336, 73778, 127023, 71220, 60980, 70710, 101937, 35384, 155705, 126522, 137785, 101437, 31294, 102461, 71232, 123968, 160317, 112707, 8773, 38477, 126029, 55375, 128592, 31316, 126037, 28246, 94298, 85595, 4700, 111197, 42590, 37471, 11870, 123490, 33378, 141414, 134247, 163943, 26216, 147562, 128618, 40046, 17519, 104559, 74350, 10864, 94327, 67705, 5242, 113275, 96889, 10370, 29315, 152196, 5764, 133254, 65671, 35977, 88714, 7818, 59019, 65165, 56462, 60048, 20627, 100504, 64670, 147109, 31910, 35498, 154798, 22706, 54966, 6839, 81080, 101561, 147132, 82621, 190, 2239, 81095, 10441, 51919, 43728, 168657, 59091, 90324, 118996, 77015, 87769, 46299, 4315, 115422, 49379, 24571, 100589, 25326, 79088, 148722, 65779, 106741, 122617, 28409, 88827, 21246, 107265, 92930, 169221, 87815, 2824,

In [238]:
burn_prob = 0.2
sampled_digraph = forest_fire_sampling_with_landmarks(directed_graph, landmarks, burn_prob)
sampled_undigraph = sampled_digraph.to_undirected()

In [243]:
dump(sampled_digraph.nodes(), "Checkpoints/sample_graph_nodes.joblib")

['Checkpoints/sample_graph_nodes.joblib']

Load sampled graph

In [16]:
sampled_digraph_nodes = load('Checkpoints/sample_graph_nodes.joblib')
sampled_digraph = directed_graph.subgraph(sampled_digraph_nodes)

In [17]:
sampled_digraph.number_of_nodes()

1538

In [18]:
sampled_digraph.number_of_edges()

8759

In [241]:
nx.average_clustering(sampled_digraph)

0.1515272953207338

In [244]:
sampled_y_true = y_true[list(sampled_digraph.nodes())]

#### louvain on cugraph (should only apply this part on linux)

In [None]:
! conda install -c rapidsai -c conda-forge -c nvidia cugraph cudatoolkit=11.8

In [None]:
edge_list = nx.to_pandas_edgelist(directed_graph)
df = cudf.DataFrame(edge_list)

cu_graph = cugraph.from_cudf_edgelist(df, source='source', destination='target')

dataframe, modularity_score = cugraph.louvain(cu_graph,resolution=0.0318184327334165538425649) #this number is achieved after more than 200 runs that create 41 clusters

In [None]:
df = dataframe.to_pandas()

In [None]:
dump(df, 'Checkpoints/louvain_community.joblib')

In [21]:
dataframe = load('Checkpoints/louvain_community.joblib')

In [22]:
smallest_community_index = 0
smallest_community_len = 2000000
for index in range(41):
  if len(dataframe[dataframe['partition']==index]) <= smallest_community_len:
    smallest_community_len = len(dataframe[dataframe['partition']==index])
    smallest_community_index = index
print(f'Smallest community have {smallest_community_len} nodes.')

Smallest community have 9 nodes.


I tried more than 200 times to get resolution that give me only 40 communities but did not find that value and get 41 communities.
As we have 41 communities and we want only 40 we can ignore the smallest community to have 40 communities because true detection of them will not have any impact on total accuracy(9 data in more than 160000 data either false or true detection will not have any affect). 

In [23]:
louvain_communities_y = []
for index in range(41):
    if index != smallest_community_index:
        temp = set(dataframe[dataframe['partition']==index].index)
        louvain_communities_y.append(temp)

In [24]:
for i in louvain_communities_y:
    print(i)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [25]:
print(f'The accuracy on louvain community detection using simple evaluator is {evaluator(y_true_set, louvain_communities_y):.3f}%.')
print(f'The accuracy on louvain community detection is using stable match based evaluator is {stable_match_evaluator(y_true_set, louvain_communities_y):.3f}%.')

The accuracy on louvain community detection using simple evaluator is 0.387%.
The accuracy on louvain community detection is using stable match based evaluator is 16.069%.


In [26]:
fluid_communities = nx.community.asyn_fluidc(undirected_graph, k=number_of_classes,max_iter=10000)
fluid_communities

<dict_valueiterator at 0x2af375676a0>

In [27]:
fluid_communities_y = []
for community in copy.deepcopy(fluid_communities):
    fluid_communities_y.append(community)
    print(community)

{163846, 32776, 8203, 65547, 163862, 57369, 122906, 81949, 163871, 32802, 8229, 57382, 98344, 90154, 24620, 73777, 24626, 32818, 114739, 32822, 8257, 57410, 139330, 122946, 49224, 90187, 65612, 75, 122958, 41044, 49237, 131159, 139354, 131162, 41053, 131169, 139361, 73827, 122982, 114797, 32878, 147571, 8308, 139382, 65660, 155785, 114825, 90250, 131216, 147605, 73877, 49306, 41114, 32927, 65696, 65697, 57513, 114857, 16560, 90295, 98488, 147642, 98492, 123073, 24771, 164036, 98503, 57547, 73937, 123094, 216, 65753, 139480, 98523, 73945, 147674, 82148, 131301, 98533, 8422, 164070, 234, 236, 16620, 41199, 41200, 155889, 82171, 57595, 147710, 155903, 90370, 65795, 82180, 49415, 73994, 267, 57611, 16655, 49424, 164115, 139539, 90388, 57622, 139549, 74014, 114983, 57642, 24878, 65839, 114993, 82227, 65845, 16696, 314, 49479, 131400, 98634, 139598, 147791, 337, 16722, 90451, 131411, 8538, 139613, 82271, 131428, 98661, 115046, 362, 8554, 90479, 106864, 131442, 74106, 90491, 8572, 131453, 164

In [28]:
dump(fluid_communities_y, 'Checkpoints/fluid_community.joblib')

['Checkpoints/fluid_community.joblib']

In [29]:
fluid_communities_y = load('Checkpoints/fluid_community.joblib')

In [30]:
print(f'The accuracy on fluid community detection using simple evaluator is {evaluator(y_true_set, fluid_communities_y):.3f}%.')
print(f'The accuracy on fluid community detection is using stable match based evaluator is {stable_match_evaluator(y_true_set, fluid_communities_y):.3f}%.')

The accuracy on fluid community detection using simple evaluator is 13.095%.
The accuracy on fluid community detection is using stable match based evaluator is 16.284%.


# Node2Vec, DeepWalk

In [11]:
def Generator_to_pd_df(generator, feature_num, row_num):
    temp = []
    for i in range(row_num):
        temp.append(generator[str(i)])
    return pd.DataFrame(temp, columns=['feature'+str(i) for i in range(feature_num)])

In [12]:
workers = multiprocessing.cpu_count()
epochs = 5
vector_size = 128

low level node2vec

In [13]:
def node2vec(Graph, walk_length = 20, num_walks = 10, p = 1.0, q = 1.0):

    def node2vec_walk(walk_length, start_node):
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(Graph.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(cur_nbrs))
                else:
                    prev = walk[-2]
                    weights = []
                    for ngbr in cur_nbrs:
                        if prev == ngbr:
                            weights.append(1/p)
                        elif Graph.has_edge(prev, ngbr):
                            weights.append(1)
                        else:
                            weights.append(1/q)
                    normalized_weights = [weight/sum(weights) for weight in weights]
                    walk.append(random.choices(cur_nbrs, weights=normalized_weights, k=1)[0])
            else:
                break

        return [str(node) for node in walk]

    walks = []
    for node in Graph.nodes():
        for _ in range(num_walks):
            walks.append(node2vec_walk(walk_length, node))
    return walks

## For directed graph

In [None]:
walks1 = node2vec(Graph=directed_graph, walk_length=70, num_walks=30, p=1, q=0.5)
node2vec_model_DFS = Word2Vec(sentences=walks1, vector_size=vector_size, window=20, min_count=0, sg=1, workers=workers, epochs=epochs)

In [28]:
print(node2vec_model_DFS.wv['0'])

[ 0.3618355   0.31277537 -0.4755639  -0.08335983  0.00265471 -0.79215395
  0.31963694 -0.16022702 -0.74719393 -0.39980388 -0.11406987  0.8696896
 -0.45733514  1.2815974   0.5228082   0.79820967  0.15179142  0.3228652
  1.341268   -0.46983668 -0.21036007 -0.25791186 -0.5356081  -1.3429608
 -0.1293629  -0.30540472  0.24603392  0.92826927  0.11583386 -0.03252866
  0.35372978  0.21079504  0.71057934  0.8659291  -0.9183807  -0.9769473
  0.5989792   0.7141681  -0.49395975 -0.00518235  0.27203068  0.40553358
 -0.29232922 -0.1800393   0.35435113  0.2268803  -0.53365624  1.2358847
  0.31245607 -0.52836967  0.6573103  -0.18957156  0.2365285   0.44327495
  0.8901398  -0.5689234   0.6961119   0.4703043  -0.43575427  0.7968273
 -0.2732962   0.36469662 -0.49640042 -0.4702065  -1.1527431  -0.23703143
 -0.95520943 -0.51026505 -0.01281628 -0.67344505  1.0776792  -1.1094524
 -0.21910977  0.53031266  0.9669332  -1.3799552  -0.09792039 -0.62756526
 -0.11892168  0.6517377  -0.01515827 -0.8194869  -0.411608

In [29]:
walks2 = node2vec(Graph=directed_graph, walk_length=70, num_walks=30, p=0.3, q=1)
node2vec_model_BFS = Word2Vec(sentences=walks2, vector_size=vector_size, window=20, min_count=0, sg=1, workers=workers, epochs=epochs)

In [30]:
print(node2vec_model_BFS.wv['0'])

[-0.00282054 -0.42913654  0.23473768  0.5577718   0.20688197 -0.00386834
 -0.21997195 -0.4131534  -0.21845512  0.19886677 -0.38724202 -0.05165048
  0.26104736 -0.37673226  1.0159112   1.2076563  -0.41240707  0.6334394
 -0.23333935  0.62159926 -0.7340297  -0.48516175  0.4225471   1.1531928
  0.3839452  -0.6897094   1.6582811  -0.46074256  0.14539668 -0.5142858
  0.3589683   0.21291044 -0.06841409  0.5319282   0.9730928   0.0770557
  0.67287123  0.13563776  1.1595075  -0.2531428   0.3578703   1.0939918
  0.0863605  -0.42412075  0.0940858  -0.2979414  -0.20687464  0.29190168
  0.01093795  1.4231689  -0.4285402   0.07923388  0.6797121   0.6811012
  0.7854584   0.40438312 -0.35983312  0.7287127   0.3360275   0.4895564
 -0.17971113  0.9266845  -0.2670595   1.044583    0.1604933  -0.49581835
  1.1708965  -0.02882608  0.76624614 -0.48353353  0.22396103  0.93369454
 -0.24654946 -0.05245344  0.45429245  0.65769804 -0.5591865   0.4244511
  0.44166768  0.731565   -1.0400044  -1.5297375   0.6370662

In [43]:
node2vec_model_BFS = Generator_to_pd_df(generator=node2vec_model_BFS.wv, feature_num=vector_size, row_num=directed_graph.number_of_nodes())

In [44]:
node2vec_model_BFS

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature118,feature119,feature120,feature121,feature122,feature123,feature124,feature125,feature126,feature127
0,-0.002821,-0.429137,0.234738,0.557772,0.206882,-0.003868,-0.219972,-0.413153,-0.218455,0.198867,...,0.906374,0.155368,0.207169,-1.339724,0.989502,-0.358717,-0.477130,0.098326,0.467705,-1.630704
1,-0.041743,-0.063587,0.104762,-0.211954,-0.156650,-0.016618,0.016864,0.019365,0.030213,-0.097968,...,0.086536,-0.059101,0.067968,0.089140,0.252418,0.257183,0.104778,-0.244282,0.248880,0.054760
2,0.016108,-0.543910,0.398740,-0.288251,0.194660,0.010385,0.477104,-0.221556,-0.011532,0.207765,...,0.301393,0.202919,0.315969,0.214791,0.441377,0.576622,0.103335,-0.213838,-0.170758,0.217037
3,-0.054510,0.050624,0.352705,-0.300716,-0.077866,-0.345784,0.096502,0.184500,0.300832,0.149310,...,0.190717,-0.242583,0.206741,-0.042302,0.853756,0.022615,0.224304,0.018481,0.186670,0.006428
4,-0.255869,-0.042148,0.133186,0.562797,-0.464918,0.220038,-0.631707,0.402156,-0.368832,0.132827,...,0.157697,0.278310,0.371269,-0.471940,0.206970,0.309458,0.077428,-0.626850,-0.368372,0.374220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169338,0.292032,-0.221887,0.230832,-0.068698,-0.268514,-0.170798,-0.172078,0.185109,-0.207600,-0.219378,...,-0.038577,0.260036,0.099110,-0.244584,0.317599,-0.090661,-0.000869,-0.385825,0.184601,-0.498295
169339,0.457587,0.061041,0.414503,-0.211737,-0.279143,-0.098680,0.376427,-0.188354,-0.246246,-0.150676,...,0.132655,0.139972,0.066173,-0.028153,-0.149167,0.010430,0.148903,-0.401151,0.196434,-0.103438
169340,-0.021760,-0.076627,0.147916,-0.104824,-0.183095,-0.092175,-0.185579,0.135532,0.089108,0.034251,...,-0.253743,-0.118964,-0.017226,-0.328976,0.279236,-0.355428,0.048222,-0.255167,-0.171442,-0.268275
169341,-0.160452,-0.150423,0.577253,-0.288902,-0.063919,-0.206402,-0.181245,0.196241,-0.165448,-0.435801,...,0.285168,0.405765,0.048382,-0.244767,0.273636,-0.059458,-0.320429,-0.407717,0.231621,-0.533215


In [45]:
node2vec_model_DFS = Generator_to_pd_df(generator=node2vec_model_DFS.wv, feature_num=vector_size, row_num=directed_graph.number_of_nodes())

In [47]:
dump(node2vec_model_DFS, './Checkpoints/node2vec_dfs.joblib')
dump(node2vec_model_BFS, './Checkpoints/node2vec_bfs.joblib')

['./Checkpoints/node2vec_bfs.joblib']

In [36]:
node2vec_model_DFS = load('./Checkpoints/node2vec_dfs.joblib')
node2vec_model_BFS = load('./Checkpoints/node2vec_bfs.joblib')

low level deep walk

In [14]:
def deepwalk(Graph, walk_length=20, num_walks=10):
    def deepwalk_walk(walk_length, start_node):
        walk = [start_node]
        for _ in range(walk_length-1):
            neighbors = list(Graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                walk.append(random.choice(neighbors))
            else:
                break
        return [str(node) for node in walk]

    walks = []
    for node in Graph.nodes():
        for _ in range(num_walks):
            walks.append(deepwalk_walk(walk_length, node))
    return walks

In [21]:
walks = deepwalk(Graph = directed_graph, walk_length=70, num_walks=30)
deepwalk_model = Word2Vec(sentences=walks, vector_size=128, window=20, min_count=0, sg=1, workers=workers, epochs=epochs)

In [50]:
deepwalk_model = Generator_to_pd_df(generator=deepwalk_model.wv, feature_num=vector_size, row_num=directed_graph.number_of_nodes())

In [51]:
dump(deepwalk_model, './Checkpoints/deepwalk.joblib')

['./Checkpoints/deepwalk.joblib']

In [35]:
deepwalk_model = load('./Checkpoints/deepwalk.joblib')

In [54]:
deepwalk_model

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature118,feature119,feature120,feature121,feature122,feature123,feature124,feature125,feature126,feature127
0,-0.241847,0.096599,0.241295,0.242957,0.321001,-0.560462,0.180110,0.378661,0.064861,-0.656584,...,0.481708,-0.164403,0.545604,0.346555,0.419157,0.539219,0.973689,-0.187731,0.543268,-0.091685
1,-0.064449,-0.271093,-0.390161,-0.341793,0.096638,0.254692,0.355645,-0.065357,0.425353,-0.103742,...,0.050001,-0.021448,0.356301,-0.108517,-0.140924,0.091390,0.206008,0.234951,0.046466,0.183394
2,-0.118329,0.130557,-0.343994,-0.178396,0.285168,0.195016,0.280647,-0.205818,0.150757,-0.262743,...,0.209570,0.928023,0.385056,-0.023704,0.068145,-0.175036,0.065249,0.141780,0.178247,-0.070960
3,0.044588,-0.118867,-0.419816,0.101149,0.135693,0.400265,-0.151116,0.203600,0.184708,-0.211585,...,-0.413865,0.010215,0.521672,-0.362477,0.015102,-0.173421,0.232902,-0.217029,-0.259490,-0.134992
4,-0.182581,0.526455,-0.133748,-0.299038,-0.046164,0.513733,-0.435238,0.458256,0.288860,-0.556735,...,-0.158606,-0.088680,0.339329,-0.050461,0.428261,-0.249821,0.212277,0.059242,0.351712,0.032664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169338,0.104741,-0.224012,-0.057725,-0.171450,-0.174875,-0.002865,-0.128008,-0.098144,0.457209,-0.046921,...,0.081107,-0.043259,0.113728,-0.073810,-0.143804,0.114949,0.115929,0.226719,0.290666,0.179626
169339,-0.259733,-0.028448,-0.137509,-0.099832,0.248975,-0.196829,0.137114,-0.137850,0.236412,0.168402,...,0.115432,0.081350,0.466032,-0.161815,-0.110489,0.090441,-0.018700,0.576801,0.047327,0.105868
169340,-0.125420,0.290814,-0.020764,-0.435389,-0.176673,0.016224,-0.216484,0.173718,0.198298,-0.262008,...,-0.065220,0.218444,0.556012,-0.258921,0.033628,-0.445385,-0.175524,0.196412,0.137260,-0.051317
169341,0.203137,-0.016587,-0.048261,-0.069132,-0.093208,-0.183388,-0.124190,0.104700,0.453176,-0.269166,...,0.213849,-0.210950,0.209616,-0.127291,-0.035141,-0.225360,-0.017706,-0.097524,0.702476,0.163914


# Deep learning on graphs

In [9]:
split_idx = dataset.get_idx_split()
train_idx = split_idx['train']
epochs = 200

In [10]:
def train(model, data, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.adj_t)[train_idx]
    loss = F.nll_loss(out, data.y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()

In [11]:
@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()

    out = model(data.x, data.adj_t)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    return train_acc, valid_acc, test_acc

In [12]:
class GCN(torch.nn.Module):

    def __init__(self, input_size, hidden_layer_size, output_size, layer_count, dropout_probability):

        super(GCN, self).__init__()

        self.convolutions = torch.nn.ModuleList()
        self.batch_normalizations = torch.nn.ModuleList()
        self.dropout_probability = dropout_probability

        self.convolutions.append(GCNConv(input_size, hidden_layer_size, cached=True))
        self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_size))

        for _ in range(layer_count - 2):
            self.convolutions.append(GCNConv(hidden_layer_size, hidden_layer_size, cached=True))
            self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_size))
        
        self.convolutions.append(GCNConv(hidden_layer_size, output_size, cached=True))


    def reset_parameters(self):
        for convolution in self.convolutions:
            convolution.reset_parameters()
        for norm in self.batch_normalizations:
            norm.reset_parameters()
    
    def forward(self, data, adj_t):
        for index, convolution in enumerate(self.convolutions[:-1]):
            data = convolution(data, adj_t)
            data = self.batch_normalizations[index](data)
            data = F.relu(data)
            data = F.dropout(data, p=self.dropout_probability, training=self.training)
        data = self.convolutions[-1](data, adj_t)
        return data.log_softmax(dim=-1)


In [13]:
class GraphSage(torch.nn.Module):
    
    def __init__(self, input_size, hidden_layer_size, output_size, layer_count, dropout_probability):

        super(GraphSage, self).__init__()

        self.convolutions = torch.nn.ModuleList()
        self.batch_normalizations = torch.nn.ModuleList()
        self.dropout_probability = dropout_probability

        self.convolutions.append(SAGEConv(input_size, hidden_layer_size, cached=True))
        self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_size))

        for _ in range(layer_count - 2):
            self.convolutions.append(SAGEConv(hidden_layer_size, hidden_layer_size, cached=True))
            self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_size))
        
        self.convolutions.append(SAGEConv(hidden_layer_size, output_size, cached=True))

    def reset_parameters(self):
        for convolution in self.convolutions:
            convolution.reset_parameters()
        for norm in self.batch_normalizations:
            norm.reset_parameters()

    def forward(self, data, adj_t):
        for index, convolution in enumerate(self.convolutions[:-1]):
            data = convolution(data, adj_t)
            data = self.batch_normalizations[index](data)
            data = F.relu(data)
            data = F.dropout(data, p=self.dropout_probability, training=self.training)
        data = self.convolutions[-1](data, adj_t)
        return data.log_softmax(dim=-1)

In [15]:
dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                 transform=T.ToSparseTensor())
data = dataset[0]
data = data.to(device)
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

In [16]:
torch.cuda.empty_cache()

In [27]:
GCN_acc = []
model = GCN(data.num_features, 256, dataset.num_classes, 4, 0.3).to(device)
evaluator = Evaluator(name='ogbn-arxiv')
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
for epoch in range(1, epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)
    train_acc, valid_acc, test_acc = result
    GCN_acc.append(100 * test_acc)
    print(f'Epoch: {epoch:02d}, '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')

Epoch: 01, Train: 8.64%, Valid: 8.02% Test: 6.44%
Epoch: 02, Train: 23.02%, Valid: 19.90% Test: 17.96%
Epoch: 03, Train: 29.13%, Valid: 30.40% Test: 26.35%
Epoch: 04, Train: 30.98%, Valid: 30.55% Test: 24.80%
Epoch: 05, Train: 28.67%, Valid: 26.66% Test: 23.01%
Epoch: 06, Train: 28.48%, Valid: 26.86% Test: 24.25%
Epoch: 07, Train: 29.65%, Valid: 29.72% Test: 27.37%
Epoch: 08, Train: 31.66%, Valid: 32.90% Test: 31.44%
Epoch: 09, Train: 34.64%, Valid: 36.48% Test: 35.76%
Epoch: 10, Train: 38.24%, Valid: 40.30% Test: 39.53%
Epoch: 11, Train: 41.81%, Valid: 44.15% Test: 42.60%
Epoch: 12, Train: 45.14%, Valid: 47.03% Test: 44.55%
Epoch: 13, Train: 47.75%, Valid: 49.16% Test: 45.90%
Epoch: 14, Train: 49.77%, Valid: 50.71% Test: 46.57%
Epoch: 15, Train: 51.33%, Valid: 51.43% Test: 46.69%
Epoch: 16, Train: 52.38%, Valid: 51.85% Test: 46.68%
Epoch: 17, Train: 53.08%, Valid: 52.07% Test: 46.41%
Epoch: 18, Train: 53.45%, Valid: 51.71% Test: 45.91%
Epoch: 19, Train: 53.69%, Valid: 51.40% Test: 45.

After some hyper parameter tunning I got the better parameters that my GCN works better.

In [28]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=GCN_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using GCN ",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [31]:
dump(GCN_acc, './Checkpoints/GCN_acc.joblib')

['./Checkpoints/GCN_acc.joblib']

In [17]:
SAGE_acc = []
model = GraphSage(data.num_features, 256, dataset.num_classes, 4, 0.3).to(device)

evaluator = Evaluator(name='ogbn-arxiv')

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(1, epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)
    train_acc, valid_acc, test_acc = result
    SAGE_acc.append(100 * test_acc)
    print(f'Epoch: {epoch:02d} ', f'Loss: {loss:.4f} ', f'Train: {100 * train_acc:.2f}%, ' f'Valid: {100 * valid_acc:.2f}% ', f'Test: {100 * test_acc:.2f}%')


Epoch: 01  Loss: 3.8003  Train: 18.57%, Valid: 23.93%  Test: 24.54%
Epoch: 02  Loss: 2.9666  Train: 13.53%, Valid: 21.06%  Test: 28.84%
Epoch: 03  Loss: 2.7461  Train: 24.63%, Valid: 24.27%  Test: 24.08%
Epoch: 04  Loss: 2.5267  Train: 26.55%, Valid: 27.78%  Test: 27.39%
Epoch: 05  Loss: 2.3900  Train: 32.90%, Valid: 37.41%  Test: 34.61%
Epoch: 06  Loss: 2.2317  Train: 34.19%, Valid: 38.22%  Test: 36.60%
Epoch: 07  Loss: 2.1427  Train: 35.12%, Valid: 39.20%  Test: 36.70%
Epoch: 08  Loss: 2.0537  Train: 38.68%, Valid: 42.74%  Test: 37.60%
Epoch: 09  Loss: 1.9724  Train: 41.38%, Valid: 44.52%  Test: 38.15%
Epoch: 10  Loss: 1.9200  Train: 42.61%, Valid: 45.65%  Test: 39.19%
Epoch: 11  Loss: 1.8821  Train: 43.64%, Valid: 46.92%  Test: 41.17%
Epoch: 12  Loss: 1.8452  Train: 45.03%, Valid: 48.59%  Test: 43.34%
Epoch: 13  Loss: 1.7979  Train: 46.76%, Valid: 49.75%  Test: 45.08%
Epoch: 14  Loss: 1.7617  Train: 48.97%, Valid: 51.08%  Test: 46.23%
Epoch: 15  Loss: 1.7358  Train: 51.40%, Valid: 5

After some hyper parameter tunning I got the better parameters that my Graph SAGE works better.

In [18]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=SAGE_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using Graph SAGE",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [19]:
dump(SAGE_acc, './Checkpoints/SAGE_acc.joblib')

['./Checkpoints/SAGE_acc.joblib']

I have also implemented GAT but due to low memory on GPU, I can not run :(

## Deep Learning on undirected graphs

In [20]:
dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                 transform=T.ToSparseTensor())
data = dataset[0]
data.adj_t = data.adj_t.to_symmetric()
data = data.to(device)
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

In [21]:
GCN_acc_undirected = []
model = GCN(data.num_features, 256, dataset.num_classes, 4, 0.3).to(device)
evaluator = Evaluator(name='ogbn-arxiv')
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
for epoch in range(1, epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)
    train_acc, valid_acc, test_acc = result
    GCN_acc_undirected.append(100 * test_acc)
    print(f'Epoch: {epoch:02d}, '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')

Epoch: 01, Train: 24.63%, Valid: 28.46% Test: 25.47%
Epoch: 02, Train: 31.89%, Valid: 32.25% Test: 28.50%
Epoch: 03, Train: 35.63%, Valid: 35.48% Test: 33.08%
Epoch: 04, Train: 35.10%, Valid: 26.02% Test: 23.58%
Epoch: 05, Train: 33.94%, Valid: 24.79% Test: 22.81%
Epoch: 06, Train: 29.93%, Valid: 22.23% Test: 20.10%
Epoch: 07, Train: 26.77%, Valid: 19.88% Test: 17.51%
Epoch: 08, Train: 26.64%, Valid: 20.02% Test: 18.06%
Epoch: 09, Train: 28.97%, Valid: 24.48% Test: 23.69%
Epoch: 10, Train: 31.98%, Valid: 31.32% Test: 32.22%
Epoch: 11, Train: 34.72%, Valid: 37.79% Test: 38.84%
Epoch: 12, Train: 37.21%, Valid: 41.84% Test: 42.80%
Epoch: 13, Train: 40.06%, Valid: 44.62% Test: 44.95%
Epoch: 14, Train: 43.91%, Valid: 46.68% Test: 46.10%
Epoch: 15, Train: 46.80%, Valid: 47.75% Test: 46.10%
Epoch: 16, Train: 48.38%, Valid: 48.06% Test: 45.64%
Epoch: 17, Train: 49.12%, Valid: 48.44% Test: 45.70%
Epoch: 18, Train: 49.58%, Valid: 49.28% Test: 46.98%
Epoch: 19, Train: 49.71%, Valid: 49.93% Test: 

In [22]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=GCN_acc_undirected, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using GCN on Undirected Graph",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [23]:
dump(GCN_acc_undirected, './Checkpoints/GCN_acc_Undirected.joblib')

['./Checkpoints/GCN_acc_Undirected.joblib']

In [24]:
SAGE_acc_undirected = []
model = GraphSage(data.num_features, 256, dataset.num_classes, 4, 0.3).to(device)

evaluator = Evaluator(name='ogbn-arxiv')

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(1, epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)
    train_acc, valid_acc, test_acc = result
    SAGE_acc_undirected.append(100 * test_acc)
    print(f'Epoch: {epoch:02d} ', f'Loss: {loss:.4f} ', f'Train: {100 * train_acc:.2f}%, ' f'Valid: {100 * valid_acc:.2f}% ', f'Test: {100 * test_acc:.2f}%')


Epoch: 01  Loss: 3.7954  Train: 30.16%, Valid: 34.06%  Test: 37.11%
Epoch: 02  Loss: 2.4865  Train: 18.53%, Valid: 28.17%  Test: 26.40%
Epoch: 03  Loss: 2.8481  Train: 36.19%, Valid: 40.65%  Test: 39.67%
Epoch: 04  Loss: 2.0184  Train: 39.71%, Valid: 45.71%  Test: 46.70%
Epoch: 05  Loss: 1.9517  Train: 38.79%, Valid: 40.64%  Test: 38.47%
Epoch: 06  Loss: 1.8329  Train: 40.93%, Valid: 43.50%  Test: 41.52%
Epoch: 07  Loss: 1.7217  Train: 45.75%, Valid: 50.42%  Test: 49.65%
Epoch: 08  Loss: 1.6253  Train: 50.32%, Valid: 56.56%  Test: 57.85%
Epoch: 09  Loss: 1.5649  Train: 53.85%, Valid: 58.74%  Test: 60.52%
Epoch: 10  Loss: 1.5142  Train: 55.89%, Valid: 60.15%  Test: 61.07%
Epoch: 11  Loss: 1.4776  Train: 56.69%, Valid: 60.36%  Test: 60.47%
Epoch: 12  Loss: 1.4469  Train: 56.92%, Valid: 60.35%  Test: 60.11%
Epoch: 13  Loss: 1.4112  Train: 56.95%, Valid: 60.54%  Test: 60.14%
Epoch: 14  Loss: 1.3802  Train: 57.56%, Valid: 61.08%  Test: 60.57%
Epoch: 15  Loss: 1.3523  Train: 58.63%, Valid: 6

In [25]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=SAGE_acc_undirected, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using Graph SAGE on Undirected Graph",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [26]:
dump(SAGE_acc_undirected, './Checkpoints/SAGE_acc_Undirected.joblib')

['./Checkpoints/SAGE_acc_Undirected.joblib']

# Clustering on node embeddings

kmans dbscan
with two modes use node feature and do not

In [31]:
def kmeans_on_embeddings(data, number_of_classes, evaluator):
    kmeans = KMeans(n_clusters=number_of_classes)
    model = kmeans.fit(data)
    labels = model.labels_
    y_pred = [set() for i in range(number_of_classes)]
    for node in range(len(data)):
        label = labels[node]
        y_pred[label].add(node)
    return evaluator(y_true_set, y_pred)

In [35]:
print(f'The accuracy of node2vec in BFS mode using kmeans is {kmeans_on_embeddings(node2vec_model_BFS, number_of_classes, stable_match_evaluator):.3f}%.')

The accuracy of node2vec in BFS mode using kmeans is 21.850%.


In [36]:
print(f'The accuracy of node2vec in DFS mode using kmeans is {kmeans_on_embeddings(node2vec_model_DFS, number_of_classes, stable_match_evaluator):.3f}%.')

The accuracy of node2vec in DFS mode using kmeans is 21.671%.


In [37]:
print(f'The accuracy of deepwalk using kmeans is {kmeans_on_embeddings(deepwalk_model, number_of_classes, stable_match_evaluator):.3f}%.')

The accuracy of deepwalk using kmeans is 21.164%.


## Deep learning on Embeddings

In [40]:
split_idx = dataset.get_idx_split()
train_idx = split_idx['train']
test_idx = split_idx['test']
epochs = 200

In [41]:
def train_NN(model, x, y, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(x)[train_idx]
    loss = F.nll_loss(out, y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()

In [42]:
@torch.no_grad()
def test_NN(model, x, y, split_idx, evaluator):
    model.eval()

    out = model(x)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    return train_acc, valid_acc, test_acc

In [43]:
class NeuralNetwork(torch.nn.Module):

    def __init__(self, input_size, hidden_layer_sizes, output_size, layer_count, dropout_probability):

        super(NeuralNetwork, self).__init__()

        self.layers = torch.nn.ModuleList()
        self.batch_normalizations = torch.nn.ModuleList()
        self.dropout_probability = dropout_probability
        self.leaky_relu = torch.nn.LeakyReLU(negative_slope=0.01)

        self.layers.append(torch.nn.Linear(input_size, hidden_layer_sizes[0]))
        self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_sizes[0]))

        for index in range(layer_count - 3):
            self.layers.append(torch.nn.Linear(hidden_layer_sizes[index], hidden_layer_sizes[index+1]))
            self.batch_normalizations.append(torch.nn.BatchNorm1d(hidden_layer_sizes[index+1]))
        
        self.layers.append(torch.nn.Linear(hidden_layer_sizes[-1], output_size))


    def reset_parameters(self):
        for layer in self.layers:
            layer.reset_parameters()
        for norm in self.batch_normalizations:
            norm.reset_parameters()
    
    def forward(self, data):
        for index, layer in enumerate(self.layers[:-1]):
            data = layer(data)
            data = self.leaky_relu(data)
            data = self.batch_normalizations[index](data)
            data = F.dropout(data, p=self.dropout_probability, training=self.training)
        data = self.layers[-1](data)
        return data.log_softmax(dim=-1)


In [44]:
def NN_evaluation(x, y, input_size, hidden_layer_sizes, output_layer_size, number_of_layers, dropout, learning_rate):
    NeuralNetwork_acc = []

    model = NeuralNetwork(input_size, hidden_layer_sizes, output_layer_size, number_of_layers, dropout_probability=dropout).to(device)
    evaluator = Evaluator(name='ogbn-arxiv')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(1, epochs):
        loss = train_NN(model, x, y, train_idx, optimizer)
        result = test_NN(model, x, y, split_idx, evaluator)
        train_acc, valid_acc, test_acc = result
        NeuralNetwork_acc.append(100 * test_acc)
        print(f'Epoch: {epoch:02d}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')
    return NeuralNetwork_acc

In [45]:
deepwalk_acc = NN_evaluation( torch.tensor(deepwalk_model.values).to(device),(data.y).to(device), vector_size, [256,256], number_of_classes, 4, 0.3, 0.01)

Epoch: 01, Train: 34.49%, Valid: 40.53% Test: 40.22%
Epoch: 02, Train: 46.34%, Valid: 48.29% Test: 46.09%
Epoch: 03, Train: 49.40%, Valid: 53.03% Test: 53.67%
Epoch: 04, Train: 50.63%, Valid: 54.68% Test: 56.48%
Epoch: 05, Train: 50.52%, Valid: 54.40% Test: 55.84%
Epoch: 06, Train: 50.22%, Valid: 53.06% Test: 53.74%
Epoch: 07, Train: 48.80%, Valid: 51.80% Test: 52.37%
Epoch: 08, Train: 48.33%, Valid: 51.77% Test: 52.79%
Epoch: 09, Train: 47.98%, Valid: 51.91% Test: 53.69%
Epoch: 10, Train: 47.28%, Valid: 51.73% Test: 54.11%
Epoch: 11, Train: 46.80%, Valid: 51.55% Test: 54.10%
Epoch: 12, Train: 46.72%, Valid: 51.55% Test: 54.22%
Epoch: 13, Train: 47.13%, Valid: 52.00% Test: 54.62%
Epoch: 14, Train: 48.09%, Valid: 52.62% Test: 55.27%
Epoch: 15, Train: 49.13%, Valid: 53.28% Test: 55.91%
Epoch: 16, Train: 50.02%, Valid: 53.71% Test: 56.31%
Epoch: 17, Train: 50.78%, Valid: 54.07% Test: 56.58%
Epoch: 18, Train: 51.29%, Valid: 54.41% Test: 56.87%
Epoch: 19, Train: 51.92%, Valid: 54.88% Test: 

In [46]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=deepwalk_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using Neural Network on DeepWalk Embeddings",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [47]:
dump(deepwalk_acc, './Checkpoints/deepwalk_acc.joblib')

['./Checkpoints/deepwalk_acc.joblib']

In [48]:
node2vec_BFS_acc = NN_evaluation( torch.tensor(node2vec_model_BFS.values).to(device),(data.y).to(device), vector_size, [256,256], number_of_classes, 4, 0.3, 0.01)

Epoch: 01, Train: 34.16%, Valid: 41.27% Test: 41.96%
Epoch: 02, Train: 46.78%, Valid: 50.70% Test: 51.95%
Epoch: 03, Train: 50.22%, Valid: 53.53% Test: 55.45%
Epoch: 04, Train: 51.15%, Valid: 54.55% Test: 56.47%
Epoch: 05, Train: 50.55%, Valid: 54.41% Test: 55.48%
Epoch: 06, Train: 50.75%, Valid: 54.58% Test: 55.49%
Epoch: 07, Train: 50.54%, Valid: 54.40% Test: 55.80%
Epoch: 08, Train: 49.36%, Valid: 53.78% Test: 55.54%
Epoch: 09, Train: 48.62%, Valid: 53.23% Test: 55.24%
Epoch: 10, Train: 48.32%, Valid: 52.84% Test: 54.95%
Epoch: 11, Train: 48.10%, Valid: 52.52% Test: 54.83%
Epoch: 12, Train: 48.20%, Valid: 52.45% Test: 54.95%
Epoch: 13, Train: 48.38%, Valid: 52.58% Test: 55.31%
Epoch: 14, Train: 48.59%, Valid: 52.94% Test: 55.66%
Epoch: 15, Train: 49.06%, Valid: 53.26% Test: 55.90%
Epoch: 16, Train: 49.84%, Valid: 53.69% Test: 56.19%
Epoch: 17, Train: 50.78%, Valid: 54.18% Test: 56.45%
Epoch: 18, Train: 51.45%, Valid: 54.59% Test: 56.80%
Epoch: 19, Train: 51.81%, Valid: 54.91% Test: 

In [49]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=node2vec_BFS_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using Neural Network on Node2Vec on BFS mode Embeddings",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [50]:
dump(node2vec_model_BFS, './Checkpoints/node2vec_BFS_acc.joblib')

['./Checkpoints/node2vec_BFS_acc.joblib']

In [51]:
node2vec_DFS_acc = NN_evaluation( torch.tensor(node2vec_model_DFS.values).to(device),(data.y).to(device), vector_size, [256,256], number_of_classes, 4, 0.3, 0.01)

Epoch: 01, Train: 37.30%, Valid: 44.04% Test: 44.36%
Epoch: 02, Train: 46.18%, Valid: 50.96% Test: 51.61%
Epoch: 03, Train: 47.08%, Valid: 52.84% Test: 55.14%
Epoch: 04, Train: 50.42%, Valid: 53.23% Test: 55.83%
Epoch: 05, Train: 51.26%, Valid: 53.63% Test: 56.04%
Epoch: 06, Train: 51.19%, Valid: 54.35% Test: 56.06%
Epoch: 07, Train: 50.01%, Valid: 53.34% Test: 55.07%
Epoch: 08, Train: 48.39%, Valid: 52.07% Test: 54.41%
Epoch: 09, Train: 46.95%, Valid: 50.78% Test: 53.47%
Epoch: 10, Train: 46.20%, Valid: 49.99% Test: 52.70%
Epoch: 11, Train: 46.22%, Valid: 50.15% Test: 52.81%
Epoch: 12, Train: 46.64%, Valid: 50.87% Test: 53.56%
Epoch: 13, Train: 46.95%, Valid: 51.67% Test: 54.20%
Epoch: 14, Train: 47.69%, Valid: 52.45% Test: 54.89%
Epoch: 15, Train: 49.00%, Valid: 53.19% Test: 55.66%
Epoch: 16, Train: 50.55%, Valid: 53.93% Test: 56.50%
Epoch: 17, Train: 52.02%, Valid: 54.62% Test: 56.98%
Epoch: 18, Train: 53.19%, Valid: 55.50% Test: 57.40%
Epoch: 19, Train: 53.87%, Valid: 56.24% Test: 

In [52]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=node2vec_DFS_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Test accuracy over epochs using Neural Network on Node2Vec in DFS mode Embeddings",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig.show()

In [53]:
dump(node2vec_model_DFS, './Checkpoints/node2vec_DFS_acc.joblib')

['./Checkpoints/node2vec_DFS_acc.joblib']

In [54]:
node2vec_model_DFS = load('./Checkpoints/node2vec_DFS_acc.joblib')
node2vec_model_BFS = load('./Checkpoints/node2vec_BFS_acc.joblib')
deepwalk_acc = load('./Checkpoints/deepwalk_acc.joblib')
SAGE_acc_undirected = load('./Checkpoints/SAGE_acc_Undirected.joblib')
GCN_acc_undirected = load('./Checkpoints/GCN_acc_Undirected.joblib')
SAGE_acc = load('./Checkpoints/SAGE_acc.joblib')
GCN_acc = load('./Checkpoints/GCN_acc.joblib')

In [70]:
scatter = go.Scatter(x=[index for index in range(epochs)], y=node2vec_DFS_acc, mode="markers+lines", hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%")
fig = go.Figure(scatter)
fig.update_layout(
title="Compare Different Methods we implemented",
xaxis_title="Epoch",
yaxis_title="Test accuracy",
template='plotly_dark'
)
fig['data'][0]['name'] = 'node2vec_DFS_acc'

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=node2vec_BFS_acc, 
                        mode='lines+markers', name='node2vec_model_BFS',
                        line=dict(color='green', width=1, dash='dashdot'), 
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=deepwalk_acc, 
                        mode='lines+markers', name='deepwalk_acc',
                        line=dict(color='white', width=1, dash='dashdot'), 
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=SAGE_acc_undirected, 
                        mode='lines+markers', name='SAGE_acc_undirected',
                        line=dict(color='yellow', width=1, dash='dashdot'), 
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=GCN_acc_undirected, 
                        mode='lines+markers', name='GCN_acc_undirected',
                        line=dict(color='purple', width=1, dash='dashdot'), 
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=SAGE_acc, 
                        mode='lines+markers', name='SAGE_acc_directed',
                        line=dict(color='orange', width=1, dash='dashdot'), 
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))

fig.add_trace(go.Scatter(x=[index for index in range(epochs)], y=GCN_acc, 
                        mode='lines+markers', name='GCN_acc_directed',
                        line=dict(color='gray', width=1, dash='dashdot'),
                        hovertemplate="Epoch: %{x}<br>Test Accuracy: %{y:.2f}%"))