In [10]:
import warnings
# Suppress every Python warning for the rest of this kernel session
# (the 'ignore' action is installed with no category/module restriction).
warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn as nn
from ogb.nodeproppred import PygNodePropPredDataset
from torch_sparse import SparseTensor
from torch_geometric.utils import to_undirected, add_self_loops

def load_ogb_data(data_name):
    """Load the OGB node-property dataset ``'ogbn-' + data_name`` and decorate its graph.

    The first graph of the dataset is returned with these extra attributes:
        train_idx / valid_idx / test_idx - the split indices from ``get_idx_split()``
        adj         - the ``DA`` matrix produced by ``gen_normalized_adjs``
        num_classes - copied from the dataset

    Node features ``x`` are left as loaded (no feature normalization yet).
    """
    dataset = PygNodePropPredDataset(name='ogbn-' + data_name)
    graph = dataset[0]

    # Attach the official split to the graph object itself.
    splits = dataset.get_idx_split()
    graph.train_idx = splits["train"]
    graph.valid_idx = splits["valid"]
    graph.test_idx = splits["test"]

    # Build the self-looped adjacency and keep only the DA normalization;
    # the DAD and AD variants are computed by the helper but not used here.
    adjacency, inv_sqrt_degree = process_adj(graph)
    _, left_normalized, _ = gen_normalized_adjs(adjacency, inv_sqrt_degree)
    graph.adj = left_normalized
    graph.num_classes = dataset.num_classes
    return graph
    
def process_adj(data):
    """Build a symmetric, self-looped sparse adjacency and its D^{-1/2} vector.

    Side effect: ``data.edge_index`` is overwritten with its undirected version.

    Returns:
        adj          - torch_sparse.SparseTensor of size (num_nodes, num_nodes)
        deg_inv_sqrt - float tensor holding row_sum(adj) ** -0.5, with any
                       infinite entry (zero degree) replaced by 0
    """
    num_nodes = data.num_nodes
    data.edge_index = to_undirected(data.edge_index, num_nodes)
    src, dst = data.edge_index

    # Adjacency with the diagonal set (self loops).
    adj = SparseTensor(row=src, col=dst,
                       sparse_sizes=(num_nodes, num_nodes)).set_diag()

    degree = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = degree.pow(-0.5)
    # A zero degree would give inf; map it to 0 instead.
    deg_inv_sqrt.masked_fill_(deg_inv_sqrt == float('inf'), 0)
    return adj, deg_inv_sqrt

def gen_normalized_adjs(adj, D_isqrt):
    """Return three degree-scaled versions of ``adj``.

    ``D_isqrt`` is a 1-D vector d; with D = diag(d) the results are
        DAD = D A D     (rows and columns each scaled once)
        DA  = D D A     (rows scaled twice)
        AD  = A D D     (columns scaled twice)
    """
    row_scale = D_isqrt.view(-1, 1)   # broadcasts over rows
    col_scale = D_isqrt.view(1, -1)   # broadcasts over columns

    DAD = (row_scale * adj) * col_scale
    DA = (row_scale * row_scale) * adj
    AD = (adj * col_scale) * col_scale
    return DAD, DA, AD

def power_method_with_beta(A, x, y, alpha=1, beta=0.1, t=50):
    """Run ``t`` fixed-point iterations of the (optionally supervised) GPCA smoother.

    Each iteration computes, with z initialised to ``x``::

        p = A @ z
        if beta > 0:
            p = (1 - beta) * p + beta * (y @ (y.T @ z)) / (rowsum(y @ y.T) + 1e-6)
        z = alpha / (1 + alpha) * p + 1 / (1 + alpha) * x

    Args:
        A: propagation matrix supporting ``.matmul`` (dense tensor or SparseTensor).
        x: node feature matrix.
        y: one-hot label matrix (n x c) that should contain training labels only.
           Only used when ``beta > 0``; may be ``None`` otherwise.
        alpha: weight of the propagated term relative to the input ``x``.
        beta: mixing weight of the label term; ``beta <= 0`` disables it.
        t: number of iterations.

    Returns:
        The smoothed feature matrix z after ``t`` iterations (``x`` itself if t == 0).
    """
    use_labels = beta > 0
    if use_labels:
        # Loop-invariant label quantities: the transpose and the row sums of y y^T
        # (number of labelled nodes sharing each node's class), offset to avoid 0/0.
        y_t = y.t()
        yyt_normalizer = y.matmul(y.sum(dim=0).view(-1, 1)) + 1e-6

    inv_phi_times_x = x
    for _ in range(t):
        part1 = A.matmul(inv_phi_times_x)
        if use_labels:
            part2 = y.matmul(y_t.matmul(inv_phi_times_x)) / yyt_normalizer
            part1 = (1 - beta) * part1 + beta * part2
        inv_phi_times_x = alpha / (1 + alpha) * part1 + 1 / (1 + alpha) * x
    return inv_phi_times_x
    
def gpca(data, nhid=64, nlayer=1, alpha=1., beta=0, act=nn.Identity()):
    """
    Compute GPCA embeddings by stacking ``nlayer`` closed-form layers.

    Each layer centers the features, smooths them with
    ``power_method_with_beta``, eigendecomposes ``x^T @ smoothed`` and projects
    the smoothed features onto ``[V, -V]``, where V holds the eigenvectors of
    the largest eigenvalues, then applies ``act``.

    Currently needs the help of SparseTensor package.
    TODO:
        1. Extend to supervised GPCA: when beta> 0 we use supervised GPCA [different from Leman's] [Done]
        2. Test add some non-linearity
        3. Adapt to torch_geometric datasets
        4. Study different normalization of x and A
        5. Think about how to do this batch-wise, which is important for initialization,
           also when can not fit into GPU
    Inputs:
        data.x - torch.Tensor
        data.y - torch.Tensor
        data.adj - torch_sparse.SparseTensor
        data.train_idx, data.num_nodes, data.num_classes - used to build the
            one-hot training-label matrix
        nhid - requested embedding width
        nlayer - number of stacked GPCA layers
        alpha, beta - forwarded to power_method_with_beta
        act - callable applied to each layer's output
    Output:
        x - new embeddings after gpca. The width is nhid, unless the current
            feature dimension d provides fewer than ceil(nhid / 2)
            eigenvectors, in which case it is 2 * d.
    """
    x = data.x
    # One-hot matrix of the training labels only (rows of other nodes stay empty).
    y_train = SparseTensor(row=data.train_idx, col=data.y.squeeze()[data.train_idx],
                           sparse_sizes=(data.num_nodes, data.num_classes))

    # ceil(nhid / 2) eigenvectors are kept. The previous `-nhid//2` parsed as
    # `(-nhid)//2`, which selects the same vectors but, for odd nhid, produced
    # nhid + 1 output columns; the concatenation is now truncated to nhid.
    n_vec = (nhid + 1) // 2

    for _ in range(nlayer):
        x = x - x.mean(dim=0)
        # x = x / (x.std(dim=0)+1e-6)  # standardization, left disabled
        inv_phi_times_x = power_method_with_beta(data.adj, x, y_train, alpha, beta)

        gram = x.t().mm(inv_phi_times_x)
        # torch.symeig was deprecated and later removed; torch.linalg.eigh with
        # UPLO='U' reads the upper triangle exactly like symeig's default
        # (upper=True) and also returns eigenvalues in ascending order.
        if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'eigh'):
            _, eig_vec = torch.linalg.eigh(gram, UPLO='U')
        else:
            _, eig_vec = torch.symeig(gram, eigenvectors=True)

        # When nhid is larger than the previous hidden size, fewer than n_vec
        # eigenvectors exist and all of them are used.
        top = eig_vec[:, -n_vec:]
        weight = torch.cat([top, -top], dim=-1)[:, :nhid]
        x = act(inv_phi_times_x.mm(weight))
    return x

from torch.utils.data import Dataset
class TabularDataset(Dataset):
    """Map-style dataset over a feature tensor ``x`` and a label tensor ``y``.

    Args:
        x: features, indexed along dim 0.
        y: labels; size-1 dimensions are squeezed away (e.g. (N, 1) -> (N,)).
        transform: optional callable applied to the feature row returned by
            ``__getitem__``. ``None`` (default) returns the row unchanged.
    """
    def __init__(self, x, y, transform=None):
        self.transform = transform
        self.x = x
        labels = y.squeeze()
        # A single-sample label tensor squeezes to 0-d, which has no len();
        # keep it 1-D so the dataset reports one item.
        if labels.dim() == 0:
            labels = labels.unsqueeze(0)
        self.y = labels
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        features = self.x[idx]
        # The transform used to be stored but silently ignored.
        if self.transform is not None:
            features = self.transform(features)
        return features, self.y[idx]

In [4]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
# from diffusion_feature import preprocess
# pytorch lighting based logistic regresison [easier to train, implemented already]
from pl_bolts.models.regression import LogisticRegression

def run_gpca(data_name, nhid, alpha, beta, nlayer, batch_size=256, max_epochs=50, num_workers=6):
    """Compute GPCA embeddings for an OGB dataset and fit logistic regression on them.

    Args:
        data_name: OGB node dataset suffix, passed to ``load_ogb_data``.
        nhid, alpha, beta, nlayer: forwarded to ``gpca``.
        batch_size: mini-batch size of all three data loaders.
        max_epochs: number of training epochs for the classifier.
        num_workers: worker processes per DataLoader (previously fixed at 6).

    Returns:
        (valid_performance, test_performance) - the values returned by
        ``trainer.test`` on the validation and test loaders.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    data = load_ogb_data(data_name).to(device)

    # run gpca embedding
    embeddings = gpca(data, nhid, alpha=alpha, beta=beta, nlayer=nlayer)

    # step 1: create dataloaders
    # DataLoader needs the dataset on cpu when worker processes are used
    data = data.to('cpu')
    embeddings = embeddings.to('cpu')

    def make_loader(idx, shuffle, pin_memory):
        subset = TabularDataset(embeddings[idx], data.y[idx])
        return DataLoader(subset, batch_size=batch_size, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=pin_memory)

    train_loader = make_loader(data.train_idx, shuffle=True, pin_memory=True)
    valid_loader = make_loader(data.valid_idx, shuffle=False, pin_memory=True)
    test_loader = make_loader(data.test_idx, shuffle=False, pin_memory=False)

    # step 2: create model and trainer
    # Size the classifier from the embeddings actually produced, so it cannot
    # disagree with gpca's output width.
    model = LogisticRegression(input_dim=embeddings.size(-1), num_classes=data.num_classes,
                               learning_rate=0.001, l2_strength=5e-4)
    # Request a GPU only when one exists; a hard-coded gpus=1 fails on CPU-only hosts.
    trainer = pl.Trainer(gpus=1 if use_cuda else 0, max_epochs=max_epochs,
                         callbacks=[ModelCheckpoint(monitor='val_acc', mode='max')])

    # step 3: train and test
    # NOTE(review): `model` is passed explicitly to trainer.test, so the weights
    # evaluated are presumably the last-epoch ones rather than the best
    # checkpoint tracked by ModelCheckpoint - confirm against the installed
    # PyTorch Lightning version.
    trainer.fit(model, train_loader, valid_loader)
    valid_performance = trainer.test(model, test_dataloaders=valid_loader)
    test_performance = trainer.test(model, test_dataloaders=test_loader)

    return valid_performance, test_performance
    
    
# device = 'cpu'  # use when can not fit into GPU
# Experiment configuration for this cell: dataset, embedding width,
# GPCA hyper-parameters (beta = 0 disables the label term) and layer count.
data_name, nhid = 'arxiv', 128
alpha, beta = 1, 0
nlayer = 10

# Build the embeddings and train/evaluate the classifier on them.
run_gpca(data_name, nhid, alpha, beta, nlayer, batch_size=512)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params
----------------------------------
0 | linear | Linear | 5.2 K 
----------------------------------
5.2 K     Trainable params
0         Non-trainable params
5.2 K     Total params


Epoch 0:  75%|███████▌  | 178/237 [00:00<00:00, 259.89it/s, loss=1.37, v_num=6, train_ce_loss=1.38]
Validating: 0it [00:00, ?it/s][A
Epoch 0: 100%|██████████| 237/237 [00:00<00:00, 251.78it/s, loss=1.37, v_num=6, train_ce_loss=1.38, val_ce_loss=1.43, val_acc=0.657]
Epoch 1:  75%|███████▌  | 178/237 [00:00<00:00, 254.47it/s, loss=1.17, v_num=6, train_ce_loss=1.24, val_ce_loss=1.43, val_acc=0.657]
Validating: 0it [00:00, ?it/s][A
Epoch 1: 100%|██████████| 237/237 [00:00<00:00, 253.23it/s, loss=1.17, v_num=6, train_ce_loss=1.24, val_ce_loss=1.2, val_acc=0.676] 
Epoch 2:  75%|███████▌  | 178/237 [00:00<00:00, 254.68it/s, loss=1.11, v_num=6, train_ce_loss=1.01, val_ce_loss=1.2, val_acc=0.676]
Validating: 0it [00:00, ?it/s][A
Epoch 2: 100%|██████████| 237/237 [00:00<00:00, 252.25it/s, loss=1.11, v_num=6, train_ce_loss=1.01, val_ce_loss=1.14, val_acc=0.68]
Epoch 3:  75%|███████▌  | 178/237 [00:00<00:00, 257.26it/s, loss=1.1, v_num=6, train_ce_loss=1.21, val_ce_loss=1.14, val_acc=0.68]  
Va

# Next
1. Add node2vec embeddings as additional features / cite the paper [Done]
2. Think about nonlinearity and combining multiple layers embeddings
3. Run on products
4. Run multiple configurations
5. Are there any other ways to leverage LP (label information)?
6. How to set up the initialization to see the difference?

In [None]:
import torch
# Scratch check: indices of the non-zero entries of a random length-10 vector,
# with size-1 dimensions squeezed away.
m = torch.rand(10)
torch.nonzero(m).squeeze()

# OGB GCN implementation
Test the performance

In [20]:
import argparse

import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


class Logger(object):
    """Collect per-epoch (train, valid, test) scores for several runs and print summaries.

    Scores are stored as given and reported multiplied by 100. "Final" values
    are those at the epoch with the highest validation score of a run.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list per run; ``info`` is stored as-is."""
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one epoch's (train, valid, test) triple to run ``run`` (0-based)."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    @staticmethod
    def _summarize(run_results):
        """Return (highest train, highest valid, final train, final test) in percent."""
        scores = 100 * torch.tensor(run_results)
        best_epoch = scores[:, 1].argmax().item()
        return (scores[:, 0].max().item(),
                scores[:, 1].max().item(),
                scores[best_epoch, 0].item(),
                scores[best_epoch, 2].item())

    def print_statistics(self, run=None):
        """Print the summary of one run, or mean ± std over all runs when ``run`` is None.

        Runs without any recorded epoch are reported / skipped instead of
        raising, and runs may have different numbers of epochs (e.g. after an
        interrupted experiment).
        """
        if run is not None:
            if not self.results[run]:
                print(f'Run {run + 1:02d}: no results recorded')
                return
            highest_train, highest_valid, final_train, final_test = \
                self._summarize(self.results[run])
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {highest_train:.2f}')
            print(f'Highest Valid: {highest_valid:.2f}')
            print(f'  Final Train: {final_train:.2f}')
            print(f'   Final Test: {final_test:.2f}')
            return

        # Summarize each run on its own so ragged or partially filled runs
        # never have to be stacked into one rectangular tensor.
        best_results = [self._summarize(r) for r in self.results if r]
        if not best_results:
            print('All runs: no results recorded')
            return

        best_result = torch.tensor(best_results)

        print(f'All runs:')
        labels = ('Highest Train', 'Highest Valid', '  Final Train', '   Final Test')
        for column, label in enumerate(labels):
            r = best_result[:, column]
            print(f'{label}: {r.mean():.2f} ± {r.std():.2f}')


class GCN(torch.nn.Module):
    """Stack of cached GCNConv layers ending in a log-softmax.

    Layer widths are in -> hidden -> ... -> hidden -> out, with
    ``1 + max(num_layers - 2, 0)`` hidden-width layers before the output
    layer. One BatchNorm1d per hidden layer is constructed (and reset) but is
    not applied in ``forward``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()

        n_hidden = 1 + max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * n_hidden + [out_channels]

        self.convs = torch.nn.ModuleList([
            GCNConv(fan_in, fan_out, cached=True)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)
        ])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialize every convolution, then every batch-norm layer."""
        for layer in [*self.convs, *self.bns]:
            layer.reset_parameters()

    def forward(self, x, adj_t):
        """Return log-probabilities: (conv -> relu -> dropout) per hidden layer, then the output conv."""
        *hidden_convs, output_conv = self.convs
        for conv in hidden_convs:
            # Batch norm is intentionally skipped here (self.bns is unused).
            x = F.relu(conv(x, adj_t))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return output_conv(x, adj_t).log_softmax(dim=-1)


class SAGE(torch.nn.Module):
    """Stack of SAGEConv layers with batch norm, ending in a log-softmax.

    Layer widths are in -> hidden -> ... -> hidden -> out, with
    ``1 + max(num_layers - 2, 0)`` hidden-width layers before the output
    layer and one BatchNorm1d per hidden layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()

        n_hidden = 1 + max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * n_hidden + [out_channels]

        self.convs = torch.nn.ModuleList([
            SAGEConv(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)
        ])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialize every convolution, then every batch-norm layer."""
        for layer in [*self.convs, *self.bns]:
            layer.reset_parameters()

    def forward(self, x, adj_t):
        """Return log-probabilities: (conv -> bn -> relu -> dropout) per hidden layer, then the output conv."""
        *hidden_convs, output_conv = self.convs
        for conv, norm in zip(hidden_convs, self.bns):
            x = F.relu(norm(conv(x, adj_t)))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return output_conv(x, adj_t).log_softmax(dim=-1)


def train(model, data, train_idx, optimizer):
    """Run one full-batch optimization step and return the training loss as a float.

    The model is evaluated on the whole graph (``data.x``, ``data.adj_t``);
    the NLL loss is taken over the rows selected by ``train_idx`` only.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.adj_t)
    targets = data.y.squeeze(1)
    loss = F.nll_loss(log_probs[train_idx], targets[train_idx])

    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()

    out = model(data.x, data.adj_t)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    return train_acc, valid_acc, test_acc


def main():
    """Train and evaluate a GCN (or GraphSAGE) on ogbn-arxiv for several runs.

    Options are declared through argparse but parsed from an empty argument
    list, so the defaults below are always used. Every epoch is evaluated and
    logged; per-run and overall summaries are printed via ``Logger``.
    """
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    options = (
        ('--device', dict(type=int, default=0)),
        ('--log_steps', dict(type=int, default=1)),
        ('--use_sage', dict(action='store_true')),
        ('--num_layers', dict(type=int, default=3)),
        ('--hidden_channels', dict(type=int, default=128)),
        ('--dropout', dict(type=float, default=0.5)),
        ('--lr', dict(type=float, default=0.01)),
        ('--epochs', dict(type=int, default=500)),
        ('--runs', dict(type=int, default=10)),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    # An empty argument list keeps argparse away from the kernel's own argv.
    args = parser.parse_args('')
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())

    # Symmetrize the sparse adjacency before moving the graph to the device.
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    model_cls = SAGE if args.use_sage else GCN
    model = model_cls(data.num_features, args.hidden_channels,
                      dataset.num_classes, args.num_layers,
                      args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        # Fresh weights and a fresh optimizer for every run.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps != 0:
                continue
            train_acc, valid_acc, test_acc = result
            print(f'Run: {run + 1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Train: {100 * train_acc:.2f}%, '
                  f'Valid: {100 * valid_acc:.2f}% '
                  f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()


# Entry point: start the full experiment when this code runs as the main
# module. Executing this notebook cell does so as well, as the captured
# output below the cell shows.
if __name__ == "__main__":
    main()

Namespace(device=0, dropout=0.5, epochs=500, hidden_channels=128, log_steps=1, lr=0.01, num_layers=3, runs=10, use_sage=False)
Run: 01, Epoch: 01, Loss: 3.6930, Train: 17.91%, Valid: 7.63% Test: 5.86%
Run: 01, Epoch: 02, Loss: 3.3665, Train: 17.91%, Valid: 7.63% Test: 5.86%
Run: 01, Epoch: 03, Loss: 3.2360, Train: 17.32%, Valid: 24.96% Test: 22.75%
Run: 01, Epoch: 04, Loss: 3.1203, Train: 15.32%, Valid: 17.78% Test: 23.91%
Run: 01, Epoch: 05, Loss: 3.0866, Train: 23.14%, Valid: 20.10% Test: 24.64%
Run: 01, Epoch: 06, Loss: 2.9869, Train: 17.99%, Valid: 7.74% Test: 5.89%
Run: 01, Epoch: 07, Loss: 2.9366, Train: 26.20%, Valid: 27.59% Test: 25.25%
Run: 01, Epoch: 08, Loss: 2.8721, Train: 28.45%, Valid: 30.34% Test: 27.18%
Run: 01, Epoch: 09, Loss: 2.8110, Train: 29.72%, Valid: 30.88% Test: 27.56%
Run: 01, Epoch: 10, Loss: 2.7263, Train: 32.89%, Valid: 32.81% Test: 29.52%
Run: 01, Epoch: 11, Loss: 2.6298, Train: 39.19%, Valid: 42.19% Test: 41.13%
Run: 01, Epoch: 12, Loss: 2.5528, Train: 42

Run: 01, Epoch: 109, Loss: 1.1697, Train: 68.49%, Valid: 68.74% Test: 67.75%
Run: 01, Epoch: 110, Loss: 1.1719, Train: 68.53%, Valid: 68.75% Test: 67.88%
Run: 01, Epoch: 111, Loss: 1.1696, Train: 68.49%, Valid: 68.38% Test: 67.11%
Run: 01, Epoch: 112, Loss: 1.1681, Train: 68.63%, Valid: 68.79% Test: 68.08%
Run: 01, Epoch: 113, Loss: 1.1661, Train: 68.66%, Valid: 68.65% Test: 67.37%
Run: 01, Epoch: 114, Loss: 1.1632, Train: 68.65%, Valid: 68.52% Test: 67.31%
Run: 01, Epoch: 115, Loss: 1.1643, Train: 68.66%, Valid: 68.89% Test: 68.01%
Run: 01, Epoch: 116, Loss: 1.1619, Train: 68.67%, Valid: 68.46% Test: 67.19%
Run: 01, Epoch: 117, Loss: 1.1550, Train: 68.70%, Valid: 68.82% Test: 68.12%
Run: 01, Epoch: 118, Loss: 1.1565, Train: 68.88%, Valid: 68.72% Test: 67.49%
Run: 01, Epoch: 119, Loss: 1.1549, Train: 68.90%, Valid: 68.97% Test: 67.73%
Run: 01, Epoch: 120, Loss: 1.1512, Train: 68.86%, Valid: 69.00% Test: 68.41%
Run: 01, Epoch: 121, Loss: 1.1488, Train: 68.87%, Valid: 68.43% Test: 66.91%

Run: 01, Epoch: 217, Loss: 1.0655, Train: 70.83%, Valid: 70.14% Test: 68.85%
Run: 01, Epoch: 218, Loss: 1.0659, Train: 70.83%, Valid: 70.14% Test: 68.81%
Run: 01, Epoch: 219, Loss: 1.0643, Train: 70.84%, Valid: 70.29% Test: 69.25%
Run: 01, Epoch: 220, Loss: 1.0693, Train: 70.86%, Valid: 70.17% Test: 68.99%
Run: 01, Epoch: 221, Loss: 1.0649, Train: 70.91%, Valid: 70.36% Test: 69.24%
Run: 01, Epoch: 222, Loss: 1.0645, Train: 70.85%, Valid: 70.13% Test: 68.69%
Run: 01, Epoch: 223, Loss: 1.0630, Train: 70.84%, Valid: 69.96% Test: 68.46%
Run: 01, Epoch: 224, Loss: 1.0694, Train: 70.85%, Valid: 70.22% Test: 69.55%
Run: 01, Epoch: 225, Loss: 1.0690, Train: 70.95%, Valid: 69.77% Test: 68.06%
Run: 01, Epoch: 226, Loss: 1.0647, Train: 70.95%, Valid: 70.23% Test: 68.88%
Run: 01, Epoch: 227, Loss: 1.0653, Train: 70.96%, Valid: 70.35% Test: 69.49%
Run: 01, Epoch: 228, Loss: 1.0646, Train: 70.87%, Valid: 69.61% Test: 67.84%
Run: 01, Epoch: 229, Loss: 1.0654, Train: 70.89%, Valid: 70.41% Test: 69.67%

KeyboardInterrupt: 