In [98]:
from ogb.nodeproppred import NodePropPredDataset
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader

dataset = NodePropPredDataset(name="ogbn-arxiv")
graph, label = dataset[0]

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:11<00:00,  7.16it/s]


Extracting dataset\arxiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<?, ?it/s]

Saving...





In [25]:
import pandas as pd

In [26]:
voyage_embeddings = pd.read_csv('../data/voyage_embeddings.csv',compression='gzip')

In [28]:
voyage_embeddings.head()

Unnamed: 0,paper id,title,abstract,node idx,combined_text,voyage_embeddings
0,630234,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...,104447,Title spreadsheets on the move an evaluation o...,"[-0.04406767711043358, 0.035206135362386703, 0..."
1,803423,multi view metric learning for multi view vide...,Traditional methods on video summarization are...,15858,Title multi view metric learning for multi vie...,"[-0.0023108727764338255, 0.007105404045432806,..."
2,1102481,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...,107156,Title big data analytics in future internet of...,"[-0.01104407012462616, 0.026394708082079887, 0..."
3,1532644,machine learner for automated reasoning 0 4 an...,Machine Learner for Automated Reasoning (MaLAR...,141536,Title machine learner for automated reasoning ...,"[0.0010582322720438242, 0.035436052829027176, ..."
4,1810480,cryptographic hardening of d sequences,This paper shows how a one-way mapping using m...,82077,Title cryptographic hardening of d sequences. ...,"[-0.017066944390535355, 0.04088375344872475, 0..."


In [29]:
voyage_embeddings.sort_values(by=['node idx'],inplace=True)

In [30]:
voyage_embeddings.head()

Unnamed: 0,paper id,title,abstract,node idx,combined_text,voyage_embeddings
35,9657784,evasion attacks against machine learning at te...,"In security-sensitive applications, the succes...",0,Title evasion attacks against machine learning...,"[-0.005053904838860035, 0.019030271098017693, ..."
155,39886162,how hard is computing parity with noisy commun...,We show a tight lower bound of $\Omega(N \log\...,1,Title how hard is computing parity with noisy ...,"[-0.02934980019927025, 0.02601485513150692, 0...."
426,116214155,on the absence of the rip in real world applic...,The purpose of this paper is twofold. The firs...,2,Title on the absence of the rip in real world ...,"[-0.03380873054265976, 0.004381511360406876, 0..."
438,121432379,a promise theory perspective on data networks,Networking is undergoing a transformation thro...,3,Title a promise theory perspective on data net...,"[-0.016072826460003853, 0.05101791396737099, 0..."
837,231147053,analysis of asymptotically optimal sampling ba...,Over the last 20 years significant effort has ...,4,Title analysis of asymptotically optimal sampl...,"[-0.011148830875754356, -0.022672971710562706,..."


In [31]:
import numpy as np
voyage_arr = []
for i in range(voyage_embeddings.shape[0]):
    num_lst = voyage_embeddings['voyage_embeddings'][i].strip('[]')
    numbers_list = [float(num) for num in num_lst.split(',')]
    arr = np.array(numbers_list)
    voyage_arr.append(arr)

In [32]:
voyage_arr = np.array(voyage_arr,dtype=np.float32)

In [33]:
graph['node_feat'] = voyage_arr

In [34]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

2.2.1+cpu


In [83]:
from torch_geometric.nn import GCNConv,SAGEConv
import torch.nn.functional as F


In [99]:

class SAGE(torch.nn.Module):
    def __init__(self, in_channels,
                 hidden_channels, out_channels,
                 n_layers=2):
        
        super(SAGE, self).__init__()
        self.n_layers = n_layers

        self.layers = torch.nn.ModuleList()
        self.layers_bn = torch.nn.ModuleList()

        if n_layers == 1:
            self.layers.append(SAGEConv(in_channels, out_channels, normalize=False))
        elif n_layers == 2:
            self.layers.append(SAGEConv(in_channels, hidden_channels, normalize=False))
            self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))
            self.layers.append(SAGEConv(hidden_channels, out_channels, normalize=False))
        else:
            self.layers.append(SAGEConv(in_channels, hidden_channels, normalize=False))
            self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))

            for _ in range(n_layers - 2):
                self.layers.append(SAGEConv(hidden_channels, hidden_channels, normalize=False))
                self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))
            
            self.layers.append(SAGEConv(hidden_channels, out_channels, normalize=False))
            
        for layer in self.layers:
            layer.reset_parameters()

    def forward(self, x, edge_index):
        if len(self.layers) > 1:
            looper = self.layers[:-1]
        else:
            looper = self.layers
        
        for i, layer in enumerate(looper):
            x = layer(x, edge_index)
            try:
                x = self.layers_bn[i](x)
            except Exception as e:
                abs(1)
            finally:
                x = F.relu(x)
                x = F.dropout(x, p=0.5, training=self.training)
        
        if len(self.layers) > 1:
            x = self.layers[-1](x, edge_index)

        return F.log_softmax(x, dim=-1)
    
    def inference(self, total_loader, device):
        xs = []
        var_ = []
        for batch in total_loader:
            out, var = self.forward(batch.x.to(device), batch.edge_index.to(device))
            out = out[:batch.batch_size]
            xs.append(out.cpu())
            var_.append(var.item())
        
        out_all = torch.cat(xs, dim=0)
        
        return out_all, var_

In [89]:
torch.tensor(graph['node_feat'])

tensor([[-0.0441,  0.0352,  0.0395,  ...,  0.0078, -0.0506, -0.0571],
        [-0.0023,  0.0071,  0.0271,  ..., -0.0181, -0.0469, -0.0519],
        [-0.0110,  0.0264,  0.0183,  ..., -0.0091, -0.0550, -0.0526],
        ...,
        [-0.0190,  0.0277,  0.0447,  ..., -0.0312, -0.0689, -0.0576],
        [-0.0082,  0.0433,  0.0440,  ..., -0.0156, -0.0456, -0.0401],
        [-0.0276,  0.0150,  0.0252,  ..., -0.0049, -0.0640, -0.0262]])

In [90]:
np.unique(label)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39], dtype=int64)

In [78]:
label

array([[ 4],
       [ 5],
       [28],
       ...,
       [10],
       [ 4],
       [ 1]], dtype=int64)

In [101]:

from torch_geometric.data import Data
# Create a PyTorch Geometric Data object
data = Data(x=torch.tensor(graph['node_feat']), y=torch.tensor(label.squeeze()),edge_index=torch.tensor(graph['edge_index']))

In [102]:
model = SAGE(data.x.shape[1], 256, 40, n_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)
criterion = torch.nn.CrossEntropyLoss()

In [103]:
def train(data):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data.x, data.edge_index)  # Perform a single forward pass.
      loss = criterion(out, data.y)  # Compute the loss solely based on the training nodes.
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

In [105]:

for epoch in range(1, 50):
    loss = train(data)
    _, pred = model(data.x,data.edge_index).max(dim=1)
    # correct = float (pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
    correct = float (pred.eq(data.y).sum().item())
    acc = correct / len(data.y)
    print('Epoch: %d, Accuracy: %.4f'%(epoch,acc))
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 1, Accuracy: 0.3264
Epoch: 001, Loss: 3.9561
Epoch: 2, Accuracy: 0.3025
Epoch: 002, Loss: 4.2594
Epoch: 3, Accuracy: 0.3631
Epoch: 003, Loss: 4.0083
Epoch: 4, Accuracy: 0.3823
Epoch: 004, Loss: 3.0143
Epoch: 5, Accuracy: 0.3615
Epoch: 005, Loss: 2.8179
Epoch: 6, Accuracy: 0.4117
Epoch: 006, Loss: 2.7474
Epoch: 7, Accuracy: 0.4359
Epoch: 007, Loss: 2.4609
Epoch: 8, Accuracy: 0.4450
Epoch: 008, Loss: 2.2659


KeyboardInterrupt: 

In [104]:
data.x.shape[1]

128

In [96]:
data.y

tensor([ 4,  5, 28,  ..., 10,  4,  1])