In [1]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
import torch
import random
import fasttext

In [2]:
# Now data contains:
# data.x: Node features
# data.edge_index: Graph connectivity
# data.y: Node labels
# data.train_mask/val_mask/test_mask: Masks for splitting the dataset

# Load the CORA dataset
dataset = Planetoid(root='../Cora', name='Cora')

# Get the data object
data = dataset[0]

print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print(f"Number of node features: {data.num_features}")
print(f"Number of classes: {dataset.num_classes}")

Number of nodes: 2708
Number of edges: 10556
Number of node features: 1433
Number of classes: 7


In [3]:
import networkx as nx

graph_nx = nx.Graph()
graph_nx.add_edges_from(data.edge_index.t().tolist())

In [4]:
list(graph_nx.neighbors(13))

[1701, 1810]

In [5]:
class DeepWalk:
    def __init__(self, graph, walk_length=80, walks_per_vertex=10):
        self.graph = graph
        self.walk_length = walk_length
        self.walks_per_vertex = walks_per_vertex

    def simulate_random_walks(self):
        random_walks = []
        for _ in range(self.walks_per_vertex):
            O = list(self.graph.nodes()).copy()
            random.shuffle(O)
            for vi in O:
                Wvi = self._random_walk(vi)
                random_walks.append(Wvi)
        return random_walks


    def _random_walk(self, start_node):
        walk = [start_node]
        while len(walk) < self.walk_length:
            neighbors = list(self.graph.neighbors(walk[-1]))
            if len(neighbors) == 0:
                break
            next_node = random.choice(neighbors)
            walk.append(next_node)
        return walk
    
    def train(self):
        sentences = self.simulate_random_walks()
        with open("walks.txt", "w") as f:
            for walk in sentences:
                f.write(" ".join(map(str, walk)) + "\n")
        
        self.model = fasttext.train_unsupervised(
            input='walks.txt',
            model='skipgram',
            dim=128,
            ws=5,
            epoch=10,
            minCount=0,
            neg=0,
            loss='hs',
            thread=4,
            minn=0, maxn=0  # no subwords; pure Word2Vec behavior
        )

    def get_embedding(self, node):
        # Get the embedding for a specific node
        return self.model.get_word_vector(str(node))
    
    def get_embeddings(self):
        # Get tensor embeddings for all nodes
        embeddings = []
        for node in range(self.graph.number_of_nodes()):
            embeddings.append(self.get_embedding(node))
        return torch.tensor(embeddings)
    

In [6]:
deepwalk_model = DeepWalk(graph_nx, walk_length=80, walks_per_vertex=10)
deepwalk_model.train()

Read 2M words
Number of words:  2709
Number of labels: 0
Progress: 100.0% words/sec/thread:  127507 lr:  0.000000 avg.loss:  3.684624 ETA:   0h 0m 0s% words/sec/thread:  127637 lr:  0.012114 avg.loss:  3.846483 ETA:   0h 0m10s


In [8]:
positional_embeddings = deepwalk_model.get_embeddings()
positional_embeddings.shape

torch.Size([2708, 128])

In [None]:

def RandomWalk(G, v, walk_length):
    walk = [v]
    while len(walk) < walk_length:
        neighbors = list(G.neighbors(v))
        if not neighbors:
            break
        v = random.choice(neighbors)
        walk.append(v)
    
    return walk

def deeepwalk(graph_nx, walk_length, walks_per_node):
    num_nodes = graph_nx.number_of_nodes()
    random_walks = []
    for i in range(walks_per_node):
        O = list(graph_nx.nodes()).copy()
        random.shuffle(O)
        for vi in O:
            Wvi = RandomWalk(graph_nx, vi, walk_length)
            random_walks.append(Wvi)
    return random_walks

In [6]:
sentences = deeepwalk(graph_nx, walk_length=80, walks_per_node=10)

In [7]:
len(sentences)

27080

In [10]:
# Save walks to a file
with open("walks.txt", "w") as f:
    for walk in sentences:
        f.write(" ".join(map(str, walk)) + "\n")

In [11]:
import fasttext

model = fasttext.train_unsupervised(
    input='walks.txt',
    model='skipgram',
    dim=128,
    ws=5,
    epoch=10,
    minCount=0,
    neg=0,
    loss='hs',
    thread=4,
    minn=0, maxn=0  # no subwords; pure Word2Vec behavior
)

# Get vector for node 2
vector = model.get_word_vector("2")
print(vector.shape)

Read 2M words
Number of words:  2709
Number of labels: 0
Progress:  99.5% words/sec/thread:  129125 lr:  0.000235 avg.loss:  3.681745 ETA:   0h 0m 0s

(128,)


Progress: 100.0% words/sec/thread:  129123 lr:  0.000000 avg.loss:  3.678913 ETA:   0h 0m 0s


In [12]:
model.get_word_vector("0")

array([ 0.420978  ,  0.95378906, -0.1145293 , -0.19586547, -1.111248  ,
       -0.12487523, -0.1074683 , -0.05572588, -0.3279791 ,  0.08617289,
        0.10043468, -0.17231548, -0.2068229 ,  0.16711785, -0.03114433,
        0.5348437 , -0.02211843,  0.3208544 ,  0.06389198,  0.23906097,
        0.12849578,  0.5572322 ,  0.11254846, -0.06604139, -0.19831258,
        0.25256032, -0.75322276, -0.22516152, -0.2011957 ,  0.03793957,
        0.15414782,  0.68152004,  0.49336407,  0.028886  , -0.11230045,
        0.73488563, -0.40916163,  0.47960782, -0.30062157, -0.17928393,
       -0.08143311,  0.42212301, -0.6491701 ,  0.545988  , -0.31842914,
       -0.45619762, -0.27366212,  0.08842868, -0.25591105,  0.00456483,
       -0.10767275,  0.5368203 , -0.2657282 ,  0.0553051 , -0.15006621,
       -0.14413643, -0.7105219 , -0.15023999, -0.72897005,  0.5058041 ,
        0.41935825,  0.52716   , -0.277185  , -0.40046233, -1.1755383 ,
        0.1299369 ,  0.09509454, -0.3268316 ,  0.16994508,  0.12

In [13]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_labels',
 '_words',
 'bucket',
 'dim',
 'epoch',
 'f',
 'get_analogies',
 'get_dimension',
 'get_input_matrix',
 'get_input_vector',
 'get_label_id',
 'get_labels',
 'get_line',
 'get_meter',
 'get_nearest_neighbors',
 'get_output_matrix',
 'get_sentence_vector',
 'get_subword_id',
 'get_subwords',
 'get_word_id',
 'get_word_vector',
 'get_words',
 'is_quantized',
 'label',
 'labels',
 'loss',
 'lr',
 'lrUpdateRate',
 'maxn',
 'minCount',
 'minCountLabel',
 'minn',
 'neg',
 'predict',
 'pretrainedVectors',
 'quantize',
 'save

In [17]:
embedding_matrix = []
for i in range(data.num_nodes):
    embedding_matrix.append(model.get_word_vector(str(i)))
embedding_matrix = torch.tensor(embedding_matrix)
embedding_matrix.shape

  embedding_matrix = torch.tensor(embedding_matrix)


torch.Size([2708, 128])