# Import and install relevant packages and libraries

In [55]:
!pip install torch-geometric



# Link prediction

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import networkx as nx
import pandas as pd
import numpy as np


from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import CitationFull
from torch_geometric.datasets import HeterophilousGraphDataset
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import PPI

from torch_geometric.utils import to_networkx, from_networkx

import torch
from torch_geometric.transforms import RandomLinkSplit

# Mount Google Drive (for usage in Google Colab)

In [57]:
# Colab-only: mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive/... can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Usage for .csv data

In [58]:
# # File path
# folder_path = "/content/drive/MyDrive/NetMF_implementations/BlogCatalog-dataset/"

# # Load the data
# nodes_path = folder_path + "data/nodes.csv"
# edges_path = folder_path + "data/edges.csv"
# groups_path = folder_path + "data/groups.csv"
# group_edges_path = folder_path + "data/group-edges.csv"

# nodes_id = pd.read_csv(nodes_path, header=None, names=['id'])
# groups_id = pd.read_csv(groups_path, header=None, names=['group'])
# edges = pd.read_csv(edges_path, header=None, names=['id_1', 'id_2'])
# user_group_membership = pd.read_csv(group_edges_path, header=None, names=['id', 'group'])
# # Create a graph
# G_BC = nx.Graph()

# # Add nodes to the graph
# G_BC.add_nodes_from(nodes_id['id'])

# # Add edges to the graph
# G_BC.add_edges_from(edges[['id_1', 'id_2']].values)
# # Create a dictionary to store groups for each ID
# group_dict = {}

# # Populate the group_dict
# for _, row in user_group_membership.iterrows():
#     user_id = row['id']
#     group_id = row['group']

#     # Check if the user_id is already in the dictionary
#     if user_id in group_dict:
#         group_dict[user_id].append(group_id)
#     else:
#         group_dict[user_id] = [group_id]

# # Add group labels to the nodes
# for user_id, groups in group_dict.items():
#     nx.set_node_attributes(G_BC, {user_id: groups}, 'group_belonging')

# # Print basic graph information
# print("Number of nodes:", G_BC.number_of_nodes(), ' | ', "Number of edges:", G_BC.number_of_edges())

# Usage for PyTorch Geometric datasets

In [59]:
# LOAD DATASET
# Exactly one of the dataset lines below should be active at a time.
#dataset = Planetoid(root='/tmp/PubMed', name='PubMed') ## PUBMED
#dataset = CitationFull(root='/tmp/Cora', name='Cora')  ## CORA
dataset = HeterophilousGraphDataset(root="./", name='amazon_ratings') ## AMAZON RATINGS
#dataset = PPI("./")
data = dataset[0]
G_BC = to_networkx(data, to_undirected=True)

# Build one list of label ids per node.
if data.y.dim() == 1:
    # Single-label dataset: y[i] is already the class index of node i.
    # (torch.nonzero on such a scalar has no index columns and would give
    # an empty label list for every node.)
    labels = [[label] for label in data.y.tolist()]
else:
    # Multi-hot labels: collect the indices of the non-zero entries of each row.
    # flatten() (rather than squeeze()) keeps the result a list even when a
    # node has exactly one label.
    labels = [torch.nonzero(row).flatten().tolist() for row in data.y]

# Node ids produced by to_networkx are 0..N-1, matching the row order of data.y.
group_dict = dict(enumerate(labels))

# Attach all labels in a single call instead of one call per node.
nx.set_node_attributes(G_BC, group_dict, 'group_belonging')

print("Number of nodes:", G_BC.number_of_nodes())
print("Number of edges:", G_BC.number_of_edges())

Downloading https://github.com/yandex-research/heterophilous-graphs/raw/main/data/amazon_ratings.npz
Processing...
Done!


Number of nodes: 24492
Number of edges: 93050


# Functions for link prediction

In [60]:
def load_embedding(file):
    """
    Reads a stored embedding (e.g. a NetMF output) from a .npy file.
    :param file: path or file-like object handed straight to np.load
    :return: whatever np.load returns for that file
    """
    stored_embedding = np.load(file)
    return stored_embedding

def predict_link(u, v, embeddings):
    """
    Computes the normalized probability for an existing link between two nodes u and v based on the input
    embeddings.
    :param u: a node in the graph (used as an index/key into embeddings)
    :param v: a node in the graph (used as an index/key into embeddings)
    :param embeddings: trained embeddings, indexable by node
    :return: sigmoid normalized probability for the existence of a link
    """
    # Similarity score: inner product of the two node embeddings
    score = np.dot(embeddings[u], embeddings[v])

    # Numerically stable sigmoid: sigmoid(x) = exp(-log(1 + exp(-x))).
    # np.logaddexp(0, -x) evaluates log(1 + exp(-x)) without ever forming
    # exp(-x) directly, so strongly negative scores no longer overflow
    # (the naive 1 / (1 + np.exp(-x)) raises an overflow warning there).
    link_probability = np.exp(-np.logaddexp(0, -score))
    return link_probability


def link_predictions(embeddings, edges, y_true):
    """
    Scores every candidate edge with predict_link and evaluates the scores against the ground truth.
    :param embeddings: a model's trained embeddings
    :param edges: iterable of test edges; entries 0 and 1 of each item are the two endpoints
    :param y_true: the labels for edges (1=true, 0=false), in the same order as edges
    :return: the ROC-AUC score of the predicted link probabilities
    """
    link_scores = [predict_link(pair[0], pair[1], embeddings) for pair in edges]
    auc = roc_auc_score(y_true, link_scores)
    return auc


def train_test_split_graph(G, test_ratio=0.5):
    """
    Randomly splits the edges of a graph into a train and a test set using torch_geometric's RandomLinkSplit.
    By default half of the edges are held out for testing (test_ratio=0.5); no validation edges are held out.
    Negative edges are not added to the train split (add_negative_train_samples=False); negative test edges
    are sampled by RandomLinkSplit according to its default negative sampling ratio.
    Note: the split is random and unseeded, and nothing here prevents a vertex from losing all of its
    training edges.
    :param G: a networkx graph to be split; every node must carry a 'group_belonging' attribute
    :param test_ratio: number (int) or fraction (float) of edges to hold out for the test split
    :return: the (train, test) split as torch geometric data objects
    """
    data = from_networkx(G)
    # Node labels come from the 'group_belonging' node attribute of G
    data.y = data.group_belonging
    # No real node features are used: each node just gets its own index as a 1-d feature
    data.x = torch.arange(G.number_of_nodes()).unsqueeze(1)

    transform = RandomLinkSplit(num_val=0, num_test=test_ratio, is_undirected=True, add_negative_train_samples=False)
    # The (empty) validation split is discarded
    train_data, _, test_data = transform(data)
    return train_data, test_data

# Load embedding from file, do link prediction, and print ROC-AUC score

In [61]:
# Load the precomputed embedding from Google Drive (nothing is trained in this cell)
embedding_file = '/content/drive/MyDrive/NetMF_implementations/output_amazon_ratings_large.npy'
embeddings = load_embedding(embedding_file)

In [63]:
# Split data
# NOTE(review): train_data is not used after this line, and `embeddings` are loaded
# from a file rather than trained on train_data. If that file was produced from the
# full graph, the test edges were seen during training and the score below is
# optimistic - confirm how the embedding file was generated.
train_data, test_data = train_test_split_graph(G_BC)

# Prepare edges
# edge_label_index is transposed so that each row is one (u, v) candidate edge;
# edge_label holds the matching 1/0 ground truth.
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

# Calculate ROC-AUC
roc_auc = link_predictions(embeddings, test_edges, y_true)


In [64]:
# Display the ROC-AUC score of the link predictions on the test edges
roc_auc

0.9967745448949146