# GraphSAGE on Python
Inspired in the following tutorial:
https://antonsruberts.github.io/graph/graphsage/

In [13]:
import pandas as pd
import numpy as np
import json
from sklearn import model_selection

RANDOM_STATE = 2023

## 1. Load dataset

The data that we need is:
1. Nodes (Each node id and initial features).
2. Edges (Or just a method for getting N neighbors from a node).

In [14]:
# Download dataset
# Source: https://snap.stanford.edu/data/github-social.html
!wget https://snap.stanford.edu/data/git_web_ml.zip

--2023-01-26 18:02:22--  https://snap.stanford.edu/data/git_web_ml.zip
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2396031 (2.3M) [application/zip]
Saving to: ‘git_web_ml.zip.1’


2023-01-26 18:02:26 (701 KB/s) - ‘git_web_ml.zip.1’ saved [2396031/2396031]



In [15]:
# Unzip Dataset
!unzip git_web_ml.zip

Archive:  git_web_ml.zip
replace git_web_ml/musae_git_edges.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [16]:
# Load edges
edges = pd.read_csv("git_web_ml/musae_git_edges.csv")
edges.sample(10)

Unnamed: 0,id_1,id_2
133025,10081,9485
117202,8753,3712
31652,11373,6645
192980,16260,17450
87304,6385,22872
212560,18876,27589
88318,6470,35365
75998,5529,26461
39552,2453,19430
256422,25728,26026


In [17]:
# Load features
with open("git_web_ml/musae_git_features.json") as f:
    features_dict = json.load(f)
max_feature = np.max([feat for feats in features_dict.values() for feat in feats])
features_matrix = np.zeros(shape=(len(features_dict), max_feature + 1))
i = 0
for node_id, feats in features_dict.items():
    for feat in feats:
        features_matrix[i, feat] = 1
    i += 1
features = pd.DataFrame(features_matrix, index = features_dict.keys())
features.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3995,3996,3997,3998,3999,4000,4001,4002,4003,4004
3196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Load nodes
nodes = pd.read_csv("git_web_ml/musae_git_target.csv")
nodes.index = nodes.id.astype(str)
# Use only the nodes that has features
nodes = nodes.loc[features.index, ("name", "ml_target")]
nodes.sample(10)

Unnamed: 0,name,ml_target
35484,EntranceJew,0
8018,boye,0
17123,qijiezhao,1
15840,blue951,0
2788,pcdotfan,0
10724,nigarcia88,1
30464,anfedorov,0
26789,imagine10255,0
5203,nbrosowsky,1
21824,shal,0


## 2. Model construction
1. Create `FeatureStore`, a `GraphStore`, and a `BaseSampler`

In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
!pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu116.html

In [36]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import NeighborLoader

In [27]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out, neighbor_loader, accuracy_fn):
        # dim_in  : Input dimensions
        # dim_h   : Hidden layer dimensions
        # dim_out : Output dimensions
        super().__init__()
        self.sage1 = SAGEConv(dim_in, dim_h)
        self.sage2 = SAGEConv(dim_h, dim_out)
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=0.01,
                                          weight_decay=5e-4)
        self.neighbor_loader = neighbor_loader
        self.accuracy_fn = accuracy_fn

    def forward(self, x, edge_index):
        h = self.sage1(x, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.sage2(h, edge_index)
        return h, F.log_softmax(h, dim=1)

    def fit(self, data, epochs):
        # data            : Unused
        # epochs          : Number of epochs
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = self.optimizer

        self.train()
        for epoch in range(epochs+1):
            total_loss = 0
            acc = 0
            val_loss = 0
            val_acc = 0

            # Train on batches
            for batch in self.neighbor_loader:
                optimizer.zero_grad()
                _, out = self(batch.x, batch.edge_index)
                loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
                total_loss += loss
                acc += self.accuracy_fn(out[batch.train_mask].argmax(dim=1), 
                                batch.y[batch.train_mask])
                loss.backward()
                optimizer.step()

                # Validation
                val_loss += criterion(out[batch.val_mask], batch.y[batch.val_mask])
                val_acc += self.accuracy_fn(out[batch.val_mask].argmax(dim=1), 
                                    batch.y[batch.val_mask])

            # Print metrics every 10 epochs
            if(epoch % 10 == 0):
                print(f'Epoch {epoch:>3} | Train Loss: {loss/len(self.neighbor_loader):.3f} '
                      f'| Train Acc: {acc/len(self.neighbor_loader)*100:>6.2f}% | Val Loss: '
                      f'{val_loss/len(self.neighbor_loader):.2f} | Val Acc: '
                      f'{val_acc/len(self.neighbor_loader)*100:.2f}%')

In [115]:
from torch_geometric.data.feature_store import FeatureStore

class MyFeatureStore(FeatureStore):
    def __init__(self):
        super().__init__()
        
    def _put_tensor(self, tensor, attr):
        raise Exception("_put_tensor")

    def _get_tensor(self, attr):
        raise Exception("_get_tensor")
    
    def _remove_tensor(self, attr):
        raise Exception("_remove_tensor")

    def _get_tensor_size(self, attr):
        raise Exception("_get_tensor_size")
    
    def get_all_tensor_attrs(self):
        raise Exception("get_all_tensor_attrs")
    
    def __len__(self):
        raise Exception("__len__")

In [125]:
from torch_geometric.data.graph_store import (
    EdgeAttr,
    EdgeTensorType,
    GraphStore,
)

class MyEdgeStore(GraphStore):
    def __init__(self):
        super().__init__()
        # This could be in MDB backend
        self.store: Dict[EdgeAttr, Tuple[Tensor, Tensor]] = {}

    @staticmethod
    def key(attr: EdgeAttr) -> str:
        return (attr.edge_type, attr.layout.value, attr.is_sorted, attr.size)

    def _put_edge_index(self, edge_index, edge_attr):
        try:
            self.store[MyGraphStore.key(edge_attr)] = edge_index
            return True
        except:
            return False

    def _get_edge_index(self, edge_attr):
        return self.store.get(MyGraphStore.key(edge_attr), None)

    def _remove_edge_index(self, edge_attr):
        raise Exception("Not implemented: _remove_edge_index")

    def get_all_edge_attrs(self):
        return [EdgeAttr(*key) for key in self.store]

In [113]:
# Input dimension
nfeatures = features.shape[1]
# Output dimension
nclasses  = nodes["ml_target"].nunique()
print(f"There are {nfeatures} features per node")
print(f"There are {nclasses} possible classes")
# 80% of nodes for training
train_size  = int(len(nodes) * 0.8)
# Training indices
train_index = np.random.choice(nodes.index, size=train_size, replace=False).astype(int)
# Pytorch compatibility
train_index = torch.tensor(train_index)

There are 4005 features per node
There are 2 possible classes


In [118]:
feature_store = MyFeatureStore()
graph_store   = MyGraphStore()

neighbor_loader = NeighborLoader(
    data=(feature_store, graph_store),
    num_neighbors=[10,10], 
    batch_size=16,
    input_nodes=train_index
)

Exception: ignored

In [35]:
model = GraphSAGE(
    dim_in=nfeatures, 
    dim_h=64, 
    dim_out=nclasses, 
    neighbor_loader=None, 
    accuracy_fn=None
)