In [13]:
# !pip install ogb

Collecting ogb
  Downloading ogb-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m541.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting outdated>=0.2.0
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting scikit-learn>=0.20.0
  Downloading scikit_learn-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25ldone
[?25h  Created wheel for littleutils: filename=littleutils-

In [136]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import dgl
import dgl.nn as dglnn
from dgl.data import AsNodePredDataset
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
from dgl import AddSelfLoop
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
import argparse 

In [122]:
ogbn_arxiv_dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-arxiv'))

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [02:13<00:00,  1.65s/it]


Extracting dataset/arxiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 11781.75it/s]


Converting graphs into DGL objects...


100%|██████████| 1/1 [00:00<00:00, 63.10it/s]

Saving...





In [134]:
ogbn_arxiv_dataset[0].edata

{}

In [2]:
ds = dgl.data.CSVDataset('./cora_csv/')

Done loading data from cached files.


In [3]:
# Alternative (disabled): build the dataset from the OGB 'ogbn-products' graph
# instead of the local CSV data.
# dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-products'))
# Wrap the CSV dataset `ds` for node prediction. split_ratio is presumably the
# (train, val, test) fractions -- TODO confirm against the AsNodePredDataset docs.
dataset = AsNodePredDataset(ds, split_ratio=(0.5,0.2,0.3))

In [4]:
dataset

Dataset("my_cora_dataset-as-nodepred", num_graphs=1, save_path=/Users/liuguizhou/.dgl/my_cora_dataset-as-nodepred)

In [5]:
g = dataset[0]

In [145]:
print(g)

Graph(num_nodes=2708, num_edges=5429,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [32]:
g.ndata['feat'] = g.ndata['feat'].float()

In [33]:
in_size = g.ndata['feat'].shape[1]
out_size = dataset.num_classes

In [34]:
dataset.raw_path

'/Users/liuguizhou/.dgl/my_cora_dataset-as-nodepred'

In [35]:
dataset.num_classes

7

In [147]:
g.edges()

(tensor([   0,    0,    0,  ..., 1874, 1876, 1897]),
 tensor([  21,  905,  906,  ..., 2586, 1874, 2707]))

In [146]:
g.edges()[0].shape  # 5429 edges; edges() is a (src, dst) tuple: edge i goes from node src[i] to node dst[i].

torch.Size([5429])

In [149]:
g.find_edges(2)  # edge with ID 2 (the third edge, IDs start at 0) goes from node 0 to node 906

(tensor([0]), tensor([906]))

In [152]:
g.in_degrees(), g.in_degrees().shape  # e.g. the first node (ID 0) has 3 incoming edges; one entry per node

(tensor([3, 1, 0,  ..., 3, 3, 3]), torch.Size([2708]))

In [156]:
g.out_degrees()

tensor([166,   3,  42,  ...,   0,   0,   0])

In [161]:
g.in_degrees().float() ** 0.75

tensor([2.2795, 1.0000, 0.0000,  ..., 2.2795, 2.2795, 2.2795])

In [11]:
# Command-line style configuration for the run.
# --mode picks the execution mode: 'cpu' makes the notebook use the CPU device,
# any other value makes it use 'cuda', and 'mixed' additionally enables
# `use_uva` when the data loaders are built.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', 
                    default='mixed',
                    choices=['cpu','mixed','puregpu'])
# The argument list is passed explicitly rather than read from sys.argv --
# presumably because inside a notebook sys.argv holds the kernel's own
# arguments (TODO confirm). The plain call is kept here for script use:
# args = parser.parse_args()
args = parser.parse_args(args=['--mode', 'cpu'])

In [12]:
args

Namespace(mode='cpu')

In [13]:
device = torch.device('cpu' if args.mode == 'cpu' else 'cuda')

In [85]:
class SAGE(nn.Module):
    """Three-layer GraphSAGE network using mean aggregation.

    Args:
        in_size: dimensionality of the input node features.
        hid_size: width of the two hidden layers.
        out_size: width of the final layer (one score per class).
        dropout: dropout probability applied after each hidden layer.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, in_size, hid_size, out_size, dropout=0.5):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_size, hid_size, 'mean'))
        self.layers.append(dglnn.SAGEConv(hid_size, hid_size, 'mean'))
        self.layers.append(dglnn.SAGEConv(hid_size, out_size, 'mean'))
        self.dropout = nn.Dropout(dropout)
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        """Run the sampled mini-batch through every layer.

        Args:
            blocks: one sampled block per layer (``blocks[i]`` feeds
                ``self.layers[i]``), as produced by neighbour sampling around
                the seed nodes.
            x: features passed to the first layer.

        Returns:
            The output of the last layer; ReLU and dropout are applied after
            every layer except the last.

        Raises:
            ValueError: if the number of blocks differs from the number of
                layers (previously the extra layers/blocks were silently
                ignored by ``zip``).
        """
        if len(blocks) != len(self.layers):
            raise ValueError(
                "expected {} blocks (one per layer), got {}".format(
                    len(self.layers), len(blocks)))
        last = len(self.layers) - 1
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != last:
                h = F.relu(h)
                h = self.dropout(h)
        return h

    def inference(self, g, device, batch_size):
        """Conduct layer-wise inference to get all the node embeddings.

        Each layer is evaluated for *all* nodes (using full one-hop
        neighbourhoods) before moving on to the next layer, so every node gets
        a fresh representation per layer, starting from ``g.ndata['feat']``.

        Dropout is only a no-op when the module is in eval mode, so callers
        should invoke ``model.eval()`` (and usually ``torch.no_grad()``) first.

        Args:
            g: graph whose ``ndata['feat']`` holds the input features.
            device: device on which the layers are evaluated.
            batch_size: number of output nodes per mini-batch.

        Returns:
            A CPU tensor with ``g.num_nodes()`` rows and ``out_size`` columns.
        """
        # Normalise so that the comparison with buffer_device below is between
        # two torch.device objects even if a string such as 'cpu' was passed.
        device = torch.device(device)
        feat = g.ndata['feat']
        sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=['feat'])
        dataloader = DataLoader(
            g, torch.arange(g.num_nodes()).to(g.device), sampler,
            device=device, batch_size=batch_size,
            shuffle=False, drop_last=False, num_workers=0)
        buffer_device = torch.device('cpu')
        # Pin the CPU buffer only when the compute device is not the CPU.
        pin_memory = (buffer_device != device)
        last = len(self.layers) - 1

        for l, layer in enumerate(self.layers):
            y = torch.empty(g.num_nodes(),
                            self.hid_size if l != last else self.out_size,
                            dtype=feat.dtype,  # keep the buffer in the feature dtype
                            device=buffer_device,
                            pin_memory=pin_memory)
            # feat can be large here, but the dataloader yields batches on
            # `device`, so the features being indexed must live there as well.
            feat = feat.to(device)
            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                x = feat[input_nodes]
                # len(blocks) == 1 because the sampler above samples one layer.
                h = layer(blocks[0], x)
                if l != last:
                    h = F.relu(h)
                    h = self.dropout(h)
                # by design, our output nodes are contiguous
                y[output_nodes[0]: output_nodes[-1]+1] = h.to(buffer_device)
            # The output of this layer is the input of the next one.
            feat = y

        return y

In [86]:
model = SAGE(in_size, 256, out_size).to(device)

In [87]:
model

SAGE(
  (layers): ModuleList(
    (0): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (fc_self): Linear(in_features=1433, out_features=256, bias=False)
      (fc_neigh): Linear(in_features=1433, out_features=256, bias=False)
    )
    (1): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (fc_self): Linear(in_features=256, out_features=256, bias=False)
      (fc_neigh): Linear(in_features=256, out_features=256, bias=False)
    )
    (2): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (fc_self): Linear(in_features=256, out_features=7, bias=False)
      (fc_neigh): Linear(in_features=256, out_features=7, bias=False)
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
)

In [111]:
def compute_acc(pred, labels):
    """Compute the classification accuracy.

    The predicted class of each row of ``pred`` is the index of its largest
    entry along dim 1; the result is the number of rows whose predicted class
    equals the corresponding entry of ``labels``, divided by ``len(pred)``.
    """
    predicted_class = pred.argmax(dim=1)
    num_correct = (predicted_class == labels).float().sum()
    return num_correct / len(pred)

In [112]:
def evaluate(model, graph, dataloader):
    """Return the accuracy of ``model`` over all batches of ``dataloader``.

    The model is switched to eval mode, every batch is run through it without
    gradient tracking, and the concatenated predictions are scored against the
    concatenated labels with ``compute_acc``. ``graph`` is accepted but not
    used by this function.
    """
    model.eval()
    batch_labels = []
    batch_outputs = []
    for _input_nodes, _output_nodes, blocks in dataloader:
        with torch.no_grad():
            # Features of the source nodes of the first block: the model input.
            feats = blocks[0].srcdata['feat']
            # Labels of the destination nodes of the last block: the targets.
            batch_labels.append(blocks[-1].dstdata['label'])
            batch_outputs.append(model(blocks, feats))
    all_outputs = torch.cat(batch_outputs)
    all_labels = torch.cat(batch_labels)
    return compute_acc(all_outputs, all_labels)

In [119]:
def layerwise_infer(device, graph, nid, model, batch_size):
    """Score ``model`` on the nodes ``nid`` using layer-wise inference.

    Runs ``model.inference`` over ``graph`` in eval mode without gradient
    tracking, selects the rows for ``nid`` and returns their accuracy as
    computed by ``compute_acc``.
    """
    model.eval()
    with torch.no_grad():
        # Predictions for every node of the graph.
        all_pred = model.inference(graph, device, batch_size)
        selected_pred = all_pred[nid]
        # Move the labels to wherever the predictions live before comparing.
        selected_label = graph.ndata['label'][nid].to(selected_pred.device)
        accuracy = compute_acc(selected_pred, selected_label)
    return accuracy

In [114]:
def train(args, device, g, dataset, model, num_epochs=10, batch_size=512,
          fanouts=(10, 10, 10), lr=1e-3, weight_decay=5e-4):
    """Train ``model`` with neighbour-sampled mini-batches and report progress.

    After each epoch the mean training loss and the validation accuracy
    (from ``evaluate``) are printed.

    Args:
        args: namespace with a ``mode`` attribute; ``'mixed'`` enables
            ``use_uva`` in the data loaders.
        device: device the node IDs are moved to and the loaders yield on.
        g: graph providing the ``'feat'`` and ``'label'`` node data.
        dataset: object exposing ``train_idx`` and ``val_idx``.
        model: module called as ``model(blocks, x)``.
        num_epochs: number of passes over the training nodes (default 10).
        batch_size: seed nodes per mini-batch (default 512).
        fanouts: neighbours sampled per layer (default 10 for each of the
            three layers).
        lr: Adam learning rate (default 1e-3).
        weight_decay: Adam weight decay (default 5e-4).

    The defaults reproduce the previously hard-coded settings.
    """
    train_idx = dataset.train_idx.to(device)
    val_idx = dataset.val_idx.to(device)
    sampler = NeighborSampler(list(fanouts),
                              prefetch_node_feats=['feat'],
                              prefetch_labels=['label'])
    use_uva = (args.mode == 'mixed')
    loader_kwargs = dict(device=device,
                         batch_size=batch_size,
                         drop_last=False,
                         num_workers=0,
                         use_uva=use_uva)
    train_dataloader = DataLoader(g, train_idx, sampler,
                                  shuffle=True, **loader_kwargs)
    # Validation does not benefit from shuffling, so keep a fixed batch order.
    val_dataloader = DataLoader(g, val_idx, sampler,
                                shuffle=False, **loader_kwargs)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        model.train()  # evaluate() leaves the model in eval mode
        total_loss = 0.0
        num_batches = 0
        for input_nodes, output_nodes, blocks in train_dataloader:
            x = blocks[0].srcdata['feat']
            y = blocks[-1].dstdata['label']
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        acc = evaluate(model, g, val_dataloader)
        # Guard against an empty training loader instead of failing on an
        # undefined loop counter.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print("Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} "
              .format(epoch, mean_loss, acc.item()))

In [99]:
# Scratch version of the body of evaluate(), run at top level so that the
# intermediate values (ys, y_hats, blocks) can be inspected in the next cells.
# NOTE(review): this reads `val_dataloader`, which in this notebook is only
# created in a later cell, so the cell fails on a fresh kernel run top to bottom.
model.eval()
ys = []
y_hats = []
for it, (input_nodes, output_nodes, blocks) in enumerate(val_dataloader):
    with torch.no_grad():
        x = blocks[0].srcdata['feat']  # features of the source nodes of the first block
        ys.append(blocks[-1].dstdata['label'])  # labels of the destination nodes of the last block
        y_hats.append(model(blocks, x))
# Disabled torchmetrics alternatives for scoring the collected outputs:
# return MF.accuracy(torch.cat(y_hats), torch.cat(ys))
# return MF.accuracy(torch.cat(y_hats), torch.cat(ys), task='multiclass', num_classes=7)



In [101]:
blocks

[Block(num_src_nodes=149, num_dst_nodes=113, num_edges=164),
 Block(num_src_nodes=113, num_dst_nodes=73, num_edges=102),
 Block(num_src_nodes=73, num_dst_nodes=29, num_edges=46)]

In [108]:
torch.cat(y_hats).shape, torch.cat(ys).shape

(torch.Size([541, 7]), torch.Size([541]))

In [110]:
torch.argmax(torch.cat(y_hats), dim=1)

torch.Size([541])

In [115]:
# Inline training run. The loaders and optimizer are created at top level so
# they remain available as globals to other cells (e.g. `val_dataloader`).
train_idx = dataset.train_idx.to(device)
val_idx = dataset.val_idx.to(device)
sampler = NeighborSampler([10,10,10],
                      prefetch_node_feats=['feat'],
                      prefetch_labels=['label'])
use_uva = (args.mode == 'mixed')
train_dataloader = DataLoader(g, train_idx, sampler, device=device,
                          batch_size=512,
                          shuffle=True, 
                          drop_last=False, 
                          num_workers=0,
                          use_uva=use_uva)
val_dataloader = DataLoader(g, val_idx, sampler, device=device,
                          batch_size=512,
                          shuffle=True, 
                          drop_last=False, 
                          num_workers=0,
                          use_uva=use_uva)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)

for epoch in range(10):
    model.train()  # evaluate() switches the model to eval mode each epoch
    total_loss = 0
    for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
        x = blocks[0].srcdata['feat']
        # .long() casts to int64 while keeping the tensor on its current
        # device; .type(torch.LongTensor) would move the labels to the CPU and
        # break the loss computation whenever the logits live on the GPU.
        y = blocks[-1].dstdata['label'].long()
        y_hat = model(blocks, x)
        loss = F.cross_entropy(y_hat, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()
    acc = evaluate(model, g, val_dataloader)
    # `it` is the index of the last batch, so it + 1 is the batch count.
    print("Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} "
          .format(epoch, total_loss / (it+1), acc.item()))

Epoch 00000 | Loss 0.1830 | Accuracy 0.8410 
Epoch 00001 | Loss 0.1612 | Accuracy 0.8429 
Epoch 00002 | Loss 0.1594 | Accuracy 0.8540 
Epoch 00003 | Loss 0.1375 | Accuracy 0.8540 
Epoch 00004 | Loss 0.1233 | Accuracy 0.8614 
Epoch 00005 | Loss 0.1004 | Accuracy 0.8558 
Epoch 00006 | Loss 0.1013 | Accuracy 0.8595 
Epoch 00007 | Loss 0.0819 | Accuracy 0.8614 
Epoch 00008 | Loss 0.0750 | Accuracy 0.8614 
Epoch 00009 | Loss 0.0798 | Accuracy 0.8503 


In [120]:
acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096)

100%|██████████| 1/1 [00:00<00:00, 32.98it/s]
100%|██████████| 1/1 [00:00<00:00, 36.89it/s]
100%|██████████| 1/1 [00:00<00:00, 67.61it/s]


In [121]:
acc

tensor(0.8339)