# GraphSAGE

In [57]:
# Necessary to import from sibling directory
import sys
sys.path.append("..")

from pymdb import MDBClient, BatchLoader, NodeIterator, FeatureStoreManager

import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv


## Model

In [58]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, dim_in: int, dim_h: int, dim_out: int, num_layers: int):
        super().__init__()
        self.dim_in = dim_in
        self.dim_h = dim_h
        self.dim_out = dim_out
        self.num_layers = num_layers

        if num_layers < 2:
            raise ValueError("Number of layers must be greater than 1")
        self.layers = torch.nn.ModuleList()
        self.layers.append(SAGEConv(dim_in, dim_h))
        for _ in range(num_layers - 2):
            self.layers.append(SAGEConv(dim_h, dim_h))
        self.layers.append(SAGEConv(dim_h, dim_out))

    def forward(
        self,
        node_features: torch.Tensor,  # [num_nodes, feature_size]
        edge_index: torch.Tensor,  # [2, feature_size]
    ):
        h = node_features
        for layer in self.layers[:-1]:
            h = layer(h, edge_index)
            h = F.relu(h)
            h = F.dropout(h, p=0.5, training=self.training)
        h = self.layers[-1](h, edge_index)
        return h, F.log_softmax(h, dim=1)  # (embedding, prediction)

    def fit(self, epochs, batch_loader):
        self.train()
        # Initialize parameters
        for layer in self.layers:
            layer.reset_parameters()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        accuracy = lambda y_pred, y: ((y_pred == y).sum() / len(y)).item()
        for epoch in range(epochs + 1):
            # Train on batches
            for batch in batch_loader:
                optimizer.zero_grad()
                _, out = self(batch.node_features, batch.edge_index)

                loss = criterion(out, batch.node_labels)
                acc = accuracy(out.argmax(dim=1), batch.node_labels)

                loss.backward()
                optimizer.step()

            # Print metrics every 50 epochs
            if epoch % 50 == 0:
                print(
                    f"Epoch: {epoch:>3} | Train loss: {loss:.3f} | Train acc: {acc*100:>6.2f}%"
                )

    def predict(
        self,
        client: "MDBClient",
        initial_store_name: str,
        sampler: "Sampler",
        batch_size: int,
    ):
        """
        Maybe it would be better to use something like a batch loader that receives a
        batch of node_ids as seed nodes and return a subgraph where:
        1. The first node_ids.size() features are the seed nodes features on the same order
        2. The remaining nodes are the neighbors of the seed nodes at depth 1
        3. The edges are the ones between the seed nodes and their neighbors

        Then the output would be the slice of the matrix [0:node_ids.size()]

        Notes:
        - Batches should have the original node_ids for storing the embeddings
        - Maybe FeatureStore should be closed after each usage in both batch loader and
          node iterator. This is because during the evaluation it will be modified and
          the changes should be saved between iterations
        - Maybe it is necessary to check if FeatureStore is closed before using it
          throwing an exception from C++

        TODO: Implement this feature (multi_insert_tensor)
        """
        self.eval()

        fsm = FeatureStoreManager(client)
        node_iterator = NodeIterator(client=client, batch_size=batch_size)

        store_names = fsm.list()

        if "temp1" in store_names:
            fsm.remove("temp1")
        if "temp2" in store_names:
            fsm.remove("temp2")
        if "final" in store_names:
            fsm.remove("final")

        fsm.create(name="temp1", feature_size=self.dim_h)
        fsm.create(name="temp2", feature_size=self.dim_h)
        fsm.create(name="final", feature_size=self.dim_out)

        initial_store = fsm.open(initial_store_name)
        prev_store = fsm.open("temp1")
        curr_store = fsm.open("temp2")
        final_store = fsm.open("final")

        print(f"Evaluating layer 1/{self.num_layers}")
        for node_ids in node_iterator:
            node_id = node_ids[0]
            # TODO: neighbor_ids = sampler.neighbors(node_id=node_id, num_neighbors=5)
            neighbor_ids = []
            node_features = initial_store[[node_id, *neighbor_ids]]
            edge_index = torch.tensor(
                [[node_id] * len(neighbor_ids), neighbor_ids], dtype=torch.long
            )

            h = self.layers[0](node_features, edge_index)
            h = F.relu(h)
            h = F.dropout(h, p=0.5, training=self.training)
            prev_store[node_id] = h[0]

        # From now the variable prev_store is used for READ ONLY, while curr_store is
        # used for WRITE ONLY
        for idx, layer in enumerate(self.layers[1:-1]):
            print(f"Evaluating layer {idx+2}/{self.num_layers}")
            for node_ids in node_iterator:
                node_id = node_ids[0]
                # TODO: neighbor_ids = sampler.neighbors(node_id=node_id, num_neighbors=5)
                neighbor_ids = []
                node_features = prev_store[[node_id, *neighbor_ids]]
                edge_index = torch.tensor(
                    [[node_id] * len(neighbor_ids), neighbor_ids], dtype=torch.long
                )

                h = layer(node_features, edge_index)
                h = F.relu(h)
                h = F.dropout(h, p=0.5, training=self.training)
                curr_store[node_id] = h[0]
            # Swap store references
            prev_store, curr_store = curr_store, prev_store

        print(f"Evaluating layer {self.num_layers}/{self.num_layers}")
        for node_ids in node_iterator:
            node_id = node_ids[0]
            # TODO: neighbor_ids = sampler.neighbors(node_id=node_id, num_neighbors=5)
            neighbor_ids = []
            node_features = prev_store[[node_id, *neighbor_ids]]
            edge_index = torch.tensor(
                [[node_id] * len(neighbor_ids), neighbor_ids], dtype=torch.long
            )
            h = self.layers[-1](node_features, edge_index)
            final_store[node_id] = h[0]

        initial_store.close()
        prev_store.close()
        curr_store.close()
        final_store.close()

        # Remove temporary stores
        fsm.remove("temp1")
        fsm.remove("temp2")


## Train

In [59]:
with MDBClient() as client:
    # TODO: sampler = Sampler(client=client)
    # TODO: seeds = sampler.nodes(num_nodes=1000)

    batch_loader = BatchLoader(
        client=client,
        feature_store_name="github",
        num_seeds=128,
        batch_size=64,
        neighbor_sizes=[10, 5],
        seed=2023, # TODO: Remove seeded (not used)
    )

    model = GraphSAGE(dim_in=128, dim_h=64, dim_out=2, num_layers=5)
    print(model)
    model.fit(epochs=50, batch_loader=batch_loader)


GraphSAGE(
  (layers): ModuleList(
    (0): SAGEConv(128, 64, aggr=mean)
    (1): SAGEConv(64, 64, aggr=mean)
    (2): SAGEConv(64, 64, aggr=mean)
    (3): SAGEConv(64, 64, aggr=mean)
    (4): SAGEConv(64, 2, aggr=mean)
  )
)
Epoch:   0 | Train loss: 0.538 | Train acc:  83.26%
Epoch:  50 | Train loss: 0.299 | Train acc:  87.97%


## Predict

In [60]:
with MDBClient() as client:
    model.predict(
        client=client,
        initial_store_name="github",
        sampler=None,
        batch_size=1,
    )


Evaluating layer 1/5
Evaluating layer 2/5
Evaluating layer 3/5
Evaluating layer 4/5
Evaluating layer 5/5


AttributeError: 'FeatureStoreManager' object has no attribute 'close'