## Install library

In [1]:
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-geometric
!pip install ogb
!pip install umap-learn
!pip install seaborn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/108.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/108.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m102.4/108.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m849.8 kB/s[0m eta [36m0

## Import library

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import ModuleList
from torch_cluster import random_walk

from tqdm import tqdm
from torch_geometric.nn import SAGEConv, GraphConv
from torch_geometric.datasets import Flickr, Planetoid, Reddit
from torch_geometric.loader import GraphSAINTRandomWalkSampler, ClusterData, ClusterLoader, NeighborLoader, NeighborSampler as RawNeighborSampler
from torch_geometric.typing import WITH_TORCH_SPARSE
from torch_geometric.utils import degree
import torch_geometric.transforms as T

from sklearn.linear_model import LogisticRegression

import os.path as osp
import pandas as pd
import numpy as np
import collections
from pandas.core.common import flatten

# importing obg datatset
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from pandas.core.common import flatten
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(16.7,8.27)})
sns.set_theme(style="ticks")
import collections
from scipy.special import softmax
import umap

import argparse
import time

## Download dataset and retrieve information

In [36]:
# download and loading the obg dataset
path = osp.join('data', 'Reddit')
dataset = PygNodePropPredDataset('ogbn-products', path)

  self.data, self.slices = torch.load(self.processed_paths[0])


In [37]:
# split_idx contains a dictionary of train, validation and test node indices
split_idx = dataset.get_idx_split()
# predefined ogb evaluator method used for validation of predictions
evaluator = Evaluator(name='ogbn-products')

In [38]:
# lets check the node ids distribution of train, test and val
print('Number of training nodes:', split_idx['train'].size(0))
print('Number of validation nodes:', split_idx['valid'].size(0))
print('Number of test nodes:', split_idx['test'].size(0))

Number of training nodes: 196615
Number of validation nodes: 39323
Number of test nodes: 2213091


In [39]:
# loading the dataset
data = dataset[0]

In [40]:
# graph statistics of ogb-product graph
print("Number of nodes in the graph:", data.num_nodes)
print("Number of edges in the graph:", data.num_edges)
print("Node feature matrix with shape:", data.x.shape) # [num_nodes, num_node_features]
print("Graph connectivity in COO format with shape:", data.edge_index.shape) # [2, num_edges]
print("Target to train against :", data.y.shape)
print("Node feature length", dataset.num_features)

Number of nodes in the graph: 2449029
Number of edges in the graph: 123718280
Node feature matrix with shape: torch.Size([2449029, 100])
Graph connectivity in COO format with shape: torch.Size([2, 123718280])
Target to train against : torch.Size([2449029, 1])
Node feature length 100


In [41]:
# checking the number of unique labels
# there are 47 unique categories of product
data.y.unique()

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46])

In [42]:
# load integer to real product category from label mapping provided inside the dataset
df = pd.read_csv('data/Reddit/ogbn_products/mapping/labelidx2productcategory.csv.gz')

In [43]:
# creating a dictionary of product category and corresponding integer label
label_idx, prod_cat = df.iloc[: ,0].values, df.iloc[: ,1].values
label_mapping = dict(zip(label_idx, prod_cat))

In [44]:
# lets see some of the product categories
df[:10]

Unnamed: 0,label idx,product category
0,0,Home & Kitchen
1,1,Health & Personal Care
2,2,Beauty
3,3,Sports & Outdoors
4,4,Books
5,5,"Patio, Lawn & Garden"
6,6,Toys & Games
7,7,CDs & Vinyl
8,8,Cell Phones & Accessories
9,9,Grocery & Gourmet Food


In [45]:
# counting the numbers of samples for each category
y = data.y.tolist()
y = list(flatten(y))
count_y = collections.Counter(y)
print(count_y)

Counter({4: 668950, 7: 172199, 6: 158771, 3: 151061, 12: 131886, 2: 116043, 0: 114294, 8: 110796, 1: 109832, 13: 101541, 16: 83594, 21: 80795, 9: 67358, 10: 52345, 18: 49019, 24: 45406, 17: 42337, 5: 40715, 11: 32937, 42: 32500, 15: 26911, 20: 22575, 19: 17438, 23: 3653, 14: 3079, 25: 3024, 28: 1969, 29: 1561, 43: 1399, 22: 879, 36: 630, 44: 566, 26: 553, 37: 514, 32: 513, 31: 418, 30: 277, 27: 259, 34: 154, 38: 91, 41: 61, 35: 44, 39: 37, 33: 29, 45: 9, 40: 6, 46: 1})


# GraphSAGE

### Neighborhood Sampling

In [16]:
dataset = Planetoid(path, 'Cora', transform=T.NormalizeFeatures())
data = dataset[0]

In [30]:
class NeighborSampler(RawNeighborSampler):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        # For each node in `batch`, we sample a direct neighbor (as positive
        # example) and a random node (as negative example):
        pos_batch = random_walk(row, col, batch, walk_length=1,
                                coalesced=False)[:, 1]

        neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(), ),
                                  dtype=torch.long)

        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        return super().sample(batch)

train_loader = NeighborSampler(data.edge_index, sizes=[10, 10], batch_size=256,
                               shuffle=True, num_nodes=data.num_nodes)

In [31]:
class SAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers):
        super(SAGE, self).__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()

        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels))

    def forward(self, x, adjs):
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    def full_forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

In [32]:
# Set CUDA to run GPU if have, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SAGE(data.num_node_features, hidden_channels=64, num_layers=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
x, edge_index = data.x.to(device), data.edge_index.to(device)
print(device)

cpu


In [33]:
# Train and run model
def train():
    model.train()

    total_loss = 0
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        out = model(x[n_id], adjs)
        out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)

        pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * out.size(0)

    return total_loss / data.num_nodes


@torch.no_grad()
def test():
    model.eval()
    out = model.full_forward(x, edge_index).cpu()

    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    val_acc = clf.score(out[data.val_mask], data.y[data.val_mask])
    test_acc = clf.score(out[data.test_mask], data.y[data.test_mask])

    return val_acc, test_acc


for epoch in range(1, 51):
    loss = train()
    val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
          f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 001, Loss: 1.3886, Val: 0.3440, Test: 0.3430
Epoch: 002, Loss: 1.3487, Val: 0.4960, Test: 0.4780
Epoch: 003, Loss: 1.1727, Val: 0.5380, Test: 0.5470
Epoch: 004, Loss: 1.1219, Val: 0.5760, Test: 0.5860
Epoch: 005, Loss: 1.0801, Val: 0.6140, Test: 0.6230
Epoch: 006, Loss: 1.0677, Val: 0.6400, Test: 0.6370
Epoch: 007, Loss: 1.0661, Val: 0.6620, Test: 0.6460
Epoch: 008, Loss: 1.0562, Val: 0.6780, Test: 0.6730
Epoch: 009, Loss: 1.0210, Val: 0.6780, Test: 0.6870
Epoch: 010, Loss: 0.9991, Val: 0.6900, Test: 0.6950
Epoch: 011, Loss: 0.9919, Val: 0.6860, Test: 0.7060
Epoch: 012, Loss: 0.9889, Val: 0.6880, Test: 0.7020
Epoch: 013, Loss: 0.9837, Val: 0.6980, Test: 0.6940
Epoch: 014, Loss: 0.9800, Val: 0.7120, Test: 0.7140
Epoch: 015, Loss: 0.9956, Val: 0.7060, Test: 0.7230
Epoch: 016, Loss: 0.9766, Val: 0.7060, Test: 0.7190
Epoch: 017, Loss: 0.9730, Val: 0.7200, Test: 0.7310
Epoch: 018, Loss: 0.9777, Val: 0.7360, Test: 0.7340
Epoch: 019, Loss: 0.9702, Val: 0.7380, Test: 0.7360
Epoch: 020, 

In [34]:
# Print model info
print(model)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

SAGE(
  (convs): ModuleList(
    (0): SAGEConv(1433, 64, aggr=mean)
    (1): SAGEConv(64, 64, aggr=mean)
  )
)
Model's state_dict:
convs.0.lin_l.weight 	 torch.Size([64, 1433])
convs.0.lin_l.bias 	 torch.Size([64])
convs.0.lin_r.weight 	 torch.Size([64, 1433])
convs.1.lin_l.weight 	 torch.Size([64, 64])
convs.1.lin_l.bias 	 torch.Size([64])
convs.1.lin_r.weight 	 torch.Size([64, 64])


In [35]:
# saving model
fp = 'data/model.pt'

torch.save(model, './model.pt')
torch.save(model, fp)

# GraphSAINT

# Retrieve the data and check graph information


In [3]:
path = osp.join('data', 'cora')
dataset = Planetoid(path,name = 'cora')
data = dataset[0]
row, col = data.edge_index
data.edge_weight = 1. / degree(col, data.num_nodes)[col]  # Norm by in-degree.

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [4]:
# lets check some graph statistics of graph
print("Number of nodes in the graph:", data.num_nodes)
print("Number of edges in the graph:", data.num_edges)
print("Node feature matrix with shape:", data.x.shape) # [num_nodes, num_node_features]
print("Graph connectivity in COO format with shape:", data.edge_index.shape) # [2, num_edges]
print("Target to train against :", data.y.shape)
print("Node feature length", dataset.num_features)

Number of nodes in the graph: 2708
Number of edges in the graph: 10556
Node feature matrix with shape: torch.Size([2708, 1433])
Graph connectivity in COO format with shape: torch.Size([2, 10556])
Target to train against : torch.Size([2708])
Node feature length 1433


In [5]:
parser = argparse.ArgumentParser()
parser.add_argument('--use_normalization', required=False, action='store_true')
parser.add_argument("-f", required=False)
args = parser.parse_args()

loader = GraphSAINTRandomWalkSampler(data, batch_size=16, walk_length=2,
                                     num_steps=5, sample_coverage=10,
                                     save_dir=dataset.processed_dir)

Compute GraphSAINT normalization: : 27292it [00:00, 147049.84it/s]                         


In [6]:
# Model
class Net(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        in_channels = dataset.num_node_features
        out_channels = dataset.num_classes
        self.conv1 = GraphConv(in_channels, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(3 * hidden_channels, out_channels)

    def set_aggr(self, aggr):
        self.conv1.aggr = aggr
        self.conv2.aggr = aggr
        self.conv3.aggr = aggr

    def forward(self, x0, edge_index, edge_weight=None):
        x1 = F.relu(self.conv1(x0, edge_index, edge_weight))
        x1 = F.dropout(x1, p=0.2, training=self.training)
        x2 = F.relu(self.conv2(x1, edge_index, edge_weight))
        x2 = F.dropout(x2, p=0.2, training=self.training)
        x3 = F.relu(self.conv3(x2, edge_index, edge_weight))
        x3 = F.dropout(x3, p=0.2, training=self.training)
        x = torch.cat([x1, x2, x3], dim=-1)
        x = self.lin(x)
        return x.log_softmax(dim=-1)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [9]:
def train():
    model.train()
    model.set_aggr('add' if args.use_normalization else 'mean')

    total_loss = total_examples = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()

        if args.use_normalization:
            edge_weight = data.edge_norm * data.edge_weight
            out = model(data.x, data.edge_index, edge_weight)
            loss = F.nll_loss(out, data.y, reduction='none')
            loss = (loss * data.node_norm)[data.train_mask].sum()
        else:
            out = model(data.x, data.edge_index)
            loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_nodes
        total_examples += data.num_nodes
    return total_loss / total_examples

In [10]:
@torch.no_grad()
def test():
    model.eval()
    model.set_aggr('mean')

    out = model(data.x.to(device), data.edge_index.to(device))
    pred = out.argmax(dim=-1)
    correct = pred.eq(data.y.to(device))

    accs = []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        accs.append(correct[mask].sum().item() / mask.sum().item())
    return accs

In [11]:
for epoch in range(15):
    loss = train()
    accs = test()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {accs[0]:.4f}, '
          f'Val: {accs[1]:.4f}, Test: {accs[2]:.4f}')

Epoch: 00, Loss: 1.9592, Train: 0.1786, Val: 0.1860, Test: 0.1630
Epoch: 01, Loss: nan, Train: 0.1857, Val: 0.2860, Test: 0.2690
Epoch: 02, Loss: 1.9437, Train: 0.2357, Val: 0.3860, Test: 0.3760
Epoch: 03, Loss: nan, Train: 0.2429, Val: 0.3840, Test: 0.3880
Epoch: 04, Loss: 1.9080, Train: 0.1500, Val: 0.3160, Test: 0.3200
Epoch: 05, Loss: 1.8143, Train: 0.1429, Val: 0.3180, Test: 0.3190
Epoch: 06, Loss: nan, Train: 0.1429, Val: 0.3160, Test: 0.3190
Epoch: 07, Loss: nan, Train: 0.1571, Val: 0.3180, Test: 0.3260
Epoch: 08, Loss: 1.8230, Train: 0.2857, Val: 0.3720, Test: 0.3900
Epoch: 09, Loss: 1.7032, Train: 0.2643, Val: 0.2360, Test: 0.2540
Epoch: 10, Loss: nan, Train: 0.2429, Val: 0.1680, Test: 0.1890
Epoch: 11, Loss: 1.7198, Train: 0.3714, Val: 0.2620, Test: 0.2770
Epoch: 12, Loss: 1.6336, Train: 0.3857, Val: 0.2840, Test: 0.3090
Epoch: 13, Loss: nan, Train: 0.3000, Val: 0.2000, Test: 0.2270
Epoch: 14, Loss: 1.4906, Train: 0.2143, Val: 0.1100, Test: 0.1450
