In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding regardless of the host locale.

    Keeps the same call signature as the standard
    ``locale.getpreferredencoding(do_setlocale=True)`` so that callers which
    pass the flag (positionally or by keyword) keep working after the patch.

    Args:
        do_setlocale: Accepted for signature compatibility; ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"


# Override the locale lookup for the whole process so that code asking for the
# preferred encoding always gets UTF-8.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
pip install -U node2vec gensim

In [None]:
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split
import networkx as nx
import random
import numpy as np

In [None]:
from google.colab import drive

# Mount Google Drive at this path; the data files read later in the notebook
# live underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the citation network: each non-empty line of the edge list holds one
# comma-separated "source,target" pair of integer node ids.
edges = []
with open("/content/drive/MyDrive/llms/edgelist.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray trailing newline) carries no edge;
            # without this guard int("") would abort the whole load.
            continue
        # Unpacking into exactly two names rejects lines with a wrong field count.
        source, target = map(int, line.split(","))
        edges.append((source, target))

# Create a directed graph from the parsed (source, target) pairs and report
# its node and edge counts.
G = nx.DiGraph()
G.add_edges_from(edges)
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
# Load the test pairs: each non-empty line holds one comma-separated
# "source,target" pair of integer node ids.
test_pairs_kaggle = []
with open("/content/drive/MyDrive/llms/test.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a stray trailing newline) instead of
            # failing on int("").
            continue
        # Unpacking into exactly two names rejects lines with a wrong field count.
        source, target = map(int, line.split(","))
        test_pairs_kaggle.append((source, target))

In [None]:
# Seed Python's and NumPy's global RNGs, and reuse the same seed for the
# scikit-learn splits below, so the partition is repeatable.
random_state = 42
random.seed(random_state)
np.random.seed(random_state)

# Target shares of the edge set. Only val_ratio and test_ratio drive the
# splits; train_ratio documents the remainder and is checked for consistency.
train_ratio = 0.80
val_ratio = 0.10
test_ratio = 0.10
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Get all edges and nodes from the original graph
all_edges = list(G.edges())
all_nodes = list(G.nodes())

print(f"Original graph - Nodes: {len(all_nodes)}, Edges: {len(all_edges)}")

# First split: hold out the test edges, keep the rest as train+val
train_val_edges, test_edges = train_test_split(
    all_edges,
    test_size=test_ratio,
    random_state=random_state
)

# The second split operates on the train+val remainder, so the validation
# share must be rescaled to stay at val_ratio of the *original* edge count.
val_size_relative_to_train_val = val_ratio / (1.0 - test_ratio)

train_edges, val_edges = train_test_split(
    train_val_edges,
    test_size=val_size_relative_to_train_val,
    random_state=random_state
)

# Every edge must land in exactly one of the three subsets.
assert len(train_edges) + len(val_edges) + len(test_edges) == len(all_edges)

print(f"Train edges: {len(train_edges)} ({len(train_edges)/len(all_edges)*100:.1f}%)")
print(f"Val edges: {len(val_edges)} ({len(val_edges)/len(all_edges)*100:.1f}%)")
print(f"Test edges: {len(test_edges)} ({len(test_edges)/len(all_edges)*100:.1f}%)")

# Create the training graph with the same directedness as G. It contains only
# the training edges, but every original node is added so that nodes whose
# edges all fell into val/test are still present.
G_train = nx.DiGraph() if G.is_directed() else nx.Graph()
G_train.add_edges_from(train_edges)
G_train.add_nodes_from(all_nodes)

print(f"Training graph - Nodes: {G_train.number_of_nodes()}, Edges: {G_train.number_of_edges()}")

In [None]:
# Configure node2vec on the training graph: 64-dimensional embeddings,
# walk length 100, 10 walks, single worker.
node2vec = Node2Vec(
    G_train,
    dimensions=64,
    walk_length=100,
    num_walks=10,
    workers=1,
)

# Alternative configuration kept for reference:
# node2vec = Node2Vec(G_train, dimensions=128, walk_length=30, num_walks=200, workers=4)

In [None]:
# Fit the embedding model; these keyword options are passed straight through
# to node2vec's fit().
word2vec_options = dict(window=10, min_count=1, batch_words=100)
model = node2vec.fit(**word2vec_options)

In [None]:
# Map every node of the training graph to its learned vector; vectors are
# looked up in the model by the string form of the node id.
node_embeddings = {
    node: model.wv[str(node)]
    for node in G_train.nodes()
}

In [None]:
import pickle

# Destination file for the serialized embeddings
embeddings_filename = "/content/drive/MyDrive/llms/node_embeddings.pkl"

# Write the node -> vector dictionary to disk in binary mode
with open(embeddings_filename, mode='wb') as out_file:
    pickle.dump(node_embeddings, out_file)

print(f"Node embeddings saved to {embeddings_filename}")