

---
본 문서는 [Colab Link](https://colab.research.google.com/drive/1DIQm9rOx2mT1bZETEeVUThxcrP1RKqAn) 를 기반으로 작성했습니다.


---




## **SetUp**

In [5]:
# Show the installed PyTorch build; the wheel index URL in the install cell below embeds this version string.
import torch
print(torch.__version__)

1.9.0+cu111


In [6]:
# Install PyTorch Geometric and its companion packages from the wheel index for torch 1.9.0+cu111.
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.9.0+cu111.html

Looking in links: https://data.pyg.org/whl/torch-1.9.0+cu111.html


In [7]:
# tensorboardX supplies the SummaryWriter imported in the import cell below.
!pip install tensorboardX



In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

import time
from datetime import datetime

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import DataLoader

import torch_geometric.transforms as T

from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## **Defining the Model**

* using `GNNStack` 
      
* graph classification : 
> 3 layers of convolution     
> mean pooling      
> 2 fully-connected layers    
> loss function : negative log-likelihood
     
* node classification
> 3 layers of convolution    
> 2 fully-connected layers    
> loss function : negative log-likelihood

#### (1) Node Classification

In [34]:
# defining Model
class GNNStack(nn.Module):
    """GCN-based network for node classification.

    The network runs ``num_layers`` rounds of ``GCNConv`` -> ReLU -> dropout,
    with a ``LayerNorm`` after every round except the last, and then feeds the
    result through a two-layer MLP head (``post_mp``) that produces
    ``output_dim`` log-probabilities per row.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output classes.
        num_layers: Number of convolution layers; must be at least 1.
            Defaults to 3.
        dropout: Dropout probability used after each convolution and inside
            the MLP head. Defaults to 0.25.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, dropout=0.25):
        super().__init__()
        if num_layers < 1:
            # forward() returns the output of the last convolution, so at
            # least one convolution has to exist.
            raise ValueError(f'num_layers must be >= 1, got {num_layers}')
        self.dropout = dropout
        self.num_layers = num_layers

        # The first convolution maps the input features to the hidden width;
        # the remaining ones keep that width.
        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))

        # One LayerNorm for every convolution except the last one.
        self.lns = nn.ModuleList()
        for _ in range(num_layers - 1):
            self.lns.append(nn.LayerNorm(hidden_dim))

        # post message passing
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one ``GCNConv`` layer mapping ``input_dim`` to ``hidden_dim``."""
        return pyg_nn.GCNConv(input_dim, hidden_dim)

    def forward(self, data):
        """Run the network on ``data``.

        Args:
            data: Object exposing ``x``, ``edge_index``, ``num_node_features``
                and ``num_nodes`` (e.g. a PyG ``Data``/``Batch``).

        Returns:
            A tuple ``(emb, log_probs)``: ``emb`` is the raw output of the
            last convolution (before ReLU/dropout) and ``log_probs`` is the
            ``log_softmax`` over dim 1 of the MLP head's output.
        """
        x, edge_index = data.x, data.edge_index
        # If the graph has no node features, use a constant feature of 1 per
        # node. Create it on the same device as the graph so the model also
        # works when it has been moved off the CPU.
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1, device=edge_index.device)

        last_layer = self.num_layers - 1
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            emb = x  # keep the pre-activation output as the embedding
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            if i != last_layer:
                x = self.lns[i](x)
        x = self.post_mp(x)
        return emb, F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Return the negative log-likelihood of ``label`` under log-probs ``pred``."""
        return F.nll_loss(pred, label)

In [35]:
def train(dataset, writer=None, epochs=200, lr=0.01):
    """Train a ``GNNStack`` for node classification and return it.

    The same loader is used for training and for evaluation; which nodes
    count is decided by each batch's ``train_mask`` here and by the masks
    read inside ``test``.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``
            whose items carry ``y`` and ``train_mask``.
        writer: Optional logger with an ``add_scalar(tag, value, step)``
            method (e.g. a tensorboardX ``SummaryWriter``). Skipped if None.
        epochs: Number of training epochs. Defaults to 200.
        lr: Adam learning rate. Defaults to 0.01.

    Returns:
        The trained model.
    """
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    test_loader = loader

    # build model
    model = GNNStack(max(dataset.num_node_features, 1), 32, dataset.num_classes)
    opt = optim.Adam(model.parameters(), lr=lr)

    # train
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            _, pred = model(batch)

            # Only the training nodes contribute to the loss.
            pred = pred[batch.train_mask]
            label = batch.y[batch.train_mask]

            loss = model.loss(pred, label)
            # Backpropagation step: autograd computes the gradients here.
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        # Average the graph-weighted batch losses over the whole dataset.
        total_loss = total_loss / len(loader.dataset)
        if writer is not None:
            writer.add_scalar('loss', total_loss, epoch)

        if epoch % 10 == 0:
            test_acc = test(test_loader, model)
            print(f'[ Epoch {epoch} ] Loss :{total_loss:.4f}, Test accuracy : {test_acc:.4f}')
            if writer is not None:
                writer.add_scalar('test accuracy', test_acc, epoch)
    return model

def test(loader, model, is_valid=False):
    """Return the node-classification accuracy of ``model`` over ``loader``.

    Args:
        loader: Iterable of batches carrying ``y`` plus ``val_mask`` and
            ``test_mask`` boolean node masks.
        model: Callable returning ``(embedding, log_probs)`` for a batch.
        is_valid: If True, score the nodes selected by ``val_mask``;
            otherwise those selected by ``test_mask``.

    Returns:
        Fraction of masked nodes predicted correctly, or 0.0 when the mask
        selects no nodes at all.
    """
    model.eval()

    correct = 0
    total = 0
    for data in loader:
        with torch.no_grad():
            _, pred = model(data)
            pred = pred.argmax(dim=1)

        mask = data.val_mask if is_valid else data.test_mask
        # Accumulate inside the loop so every batch counts, and take the
        # denominator from the same mask that selected the predictions.
        correct += pred[mask].eq(data.y[mask]).sum().item()
        total += int(mask.sum().item())

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct / total


In [36]:
# Load the Cora dataset through PyG's Planetoid class, using ./tmp/cora as its root directory.
dataset = Planetoid(root='./tmp/cora', name='cora')

In [37]:
# Print the dataset-level statistics, then inspect its first graph.
print('CORA dataset')
summary = (
    ('graphs', len(dataset)),
    ('classes', dataset.num_classes),
    ('features', dataset.num_features),
    ('node features', dataset.num_node_features),
)
for caption, count in summary:
    print(f'# {caption} = {count}')
print('\n')

d = dataset[0]
print(d)
print(f'# nodes = {d.num_nodes}\n')
# Show shape and contents of the feature matrix and the edge list.
for field in ('x', 'edge_index'):
    tensor = getattr(d, field)
    print(f'd.{field} ({tensor.shape}) =\n{tensor}\n')

CORA dataset
# graphs = 1
# classes = 7
# features = 1433
# node features = 1433


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
# nodes = 2708

d.x (torch.Size([2708, 1433])) =
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

d.edge_index (torch.Size([2, 10556])) =
tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])



In [38]:
# Train the node-classification model on Cora; train() prints loss and test accuracy every 10 epochs.
model = train(dataset)

[ Epoch 0 ] Loss :1.9539, Test accuracy : 0.1710




[ Epoch 10 ] Loss :0.3338, Test accuracy : 0.7560
[ Epoch 20 ] Loss :0.0477, Test accuracy : 0.7660
[ Epoch 30 ] Loss :0.0585, Test accuracy : 0.7140
[ Epoch 40 ] Loss :0.0096, Test accuracy : 0.7560
[ Epoch 50 ] Loss :0.0092, Test accuracy : 0.7490
[ Epoch 60 ] Loss :0.0348, Test accuracy : 0.7370
[ Epoch 70 ] Loss :0.0404, Test accuracy : 0.7440
[ Epoch 80 ] Loss :0.0022, Test accuracy : 0.7620
[ Epoch 90 ] Loss :0.0033, Test accuracy : 0.7750
[ Epoch 100 ] Loss :0.0010, Test accuracy : 0.7520
[ Epoch 110 ] Loss :0.0015, Test accuracy : 0.7130
[ Epoch 120 ] Loss :0.0019, Test accuracy : 0.7510
[ Epoch 130 ] Loss :0.0015, Test accuracy : 0.7600
[ Epoch 140 ] Loss :0.0006, Test accuracy : 0.7580
[ Epoch 150 ] Loss :0.0014, Test accuracy : 0.7590
[ Epoch 160 ] Loss :0.0003, Test accuracy : 0.7640
[ Epoch 170 ] Loss :0.0006, Test accuracy : 0.7720
[ Epoch 180 ] Loss :0.0001, Test accuracy : 0.7740
[ Epoch 190 ] Loss :0.0002, Test accuracy : 0.7710




---
#### (2) Graph classification


In [48]:
class GNNStack(nn.Module):
    """GIN-based network for graph classification.

    The network runs ``num_layers`` rounds of ``GINConv`` -> ReLU -> dropout,
    with a ``LayerNorm`` after every round except the last. Node vectors are
    then averaged per graph with ``global_mean_pool`` and passed through a
    two-layer MLP head (``post_mp``) that produces ``output_dim``
    log-probabilities per graph.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output classes.
        num_layers: Number of convolution layers; must be at least 1.
            Defaults to 3.
        dropout: Dropout probability used after each convolution and inside
            the MLP head. Defaults to 0.25.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=3, dropout=0.25):
        super().__init__()
        if num_layers < 1:
            # forward() returns the output of the last convolution, so at
            # least one convolution has to exist.
            raise ValueError(f'num_layers must be >= 1, got {num_layers}')
        self.dropout = dropout
        self.num_layers = num_layers

        # The first convolution maps the input features to the hidden width;
        # the remaining ones keep that width.
        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))

        # One LayerNorm for every convolution except the last one.
        self.lns = nn.ModuleList()
        for _ in range(num_layers - 1):
            self.lns.append(nn.LayerNorm(hidden_dim))

        # post message passing
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one ``GINConv`` wrapping a Linear -> ReLU -> Linear MLP."""
        mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )
        return pyg_nn.GINConv(mlp)

    def forward(self, data):
        """Run the network on a batch of graphs.

        Args:
            data: Object exposing ``x``, ``edge_index``, ``batch``,
                ``num_node_features`` and ``num_nodes`` (e.g. a PyG
                ``Batch``).

        Returns:
            A tuple ``(emb, log_probs)``: ``emb`` is the raw node-level output
            of the last convolution (before ReLU/dropout and pooling) and
            ``log_probs`` is the ``log_softmax`` over dim 1 of the MLP head's
            output on the pooled vectors.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # If the graph has no node features, use a constant feature of 1 per
        # node. Create it on the same device as the graph so the model also
        # works when it has been moved off the CPU.
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1, device=edge_index.device)

        last_layer = self.num_layers - 1
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            emb = x  # keep the pre-activation output as the embedding
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            if i != last_layer:
                x = self.lns[i](x)
        # Collapse node vectors into one vector per graph before the head.
        x = pyg_nn.global_mean_pool(x, batch)
        x = self.post_mp(x)
        return emb, F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Return the negative log-likelihood of ``label`` under log-probs ``pred``."""
        return F.nll_loss(pred, label)

In [49]:
def train(dataset, writer=None, epochs=200, lr=0.01):
    """Train a ``GNNStack`` for graph classification and return it.

    The first 80% of ``dataset`` (by position) is used for training and the
    remaining 20% for the periodic accuracy check.

    Args:
        dataset: Sliceable dataset of graphs exposing ``num_node_features``
            and ``num_classes``; each graph carries its label in ``y``.
        writer: Optional logger with an ``add_scalar(tag, value, step)``
            method (e.g. a tensorboardX ``SummaryWriter``). Skipped if None.
        epochs: Number of training epochs. Defaults to 200.
        lr: Adam learning rate. Defaults to 0.01.

    Returns:
        The trained model.
    """
    # Positional 80/20 split, so the caller should shuffle the dataset first.
    split = int(len(dataset) * 0.8)
    loader = DataLoader(dataset[:split], batch_size=64, shuffle=True)
    test_loader = DataLoader(dataset[split:], batch_size=64, shuffle=True)

    # build model
    model = GNNStack(max(dataset.num_node_features, 1), 32, dataset.num_classes)
    opt = optim.Adam(model.parameters(), lr=lr)

    # train
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            _, pred = model(batch)

            loss = model.loss(pred, batch.y)
            # Backpropagation step: autograd computes the gradients here.
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        # Average the graph-weighted batch losses over the training split.
        total_loss = total_loss / len(loader.dataset)
        if writer is not None:
            writer.add_scalar('loss', total_loss, epoch)

        if epoch % 10 == 0:
            test_acc = test(test_loader, model)
            print(f'[ Epoch {epoch} ] Loss :{total_loss:.4f}, Test accuracy : {test_acc:.4f}')
            if writer is not None:
                writer.add_scalar('test accuracy', test_acc, epoch)
    return model

def test(loader, model, is_valid=False):
    """Return the graph-classification accuracy of ``model`` over ``loader``.

    Args:
        loader: Loader yielding batches with labels in ``y``; its ``dataset``
            attribute supplies the number of graphs used as the denominator.
        model: Callable returning ``(embedding, log_probs)`` for a batch.
        is_valid: Accepted for signature parity; not used by this function.

    Returns:
        Number of correctly classified graphs divided by ``len(loader.dataset)``.
    """
    model.eval()

    hits = 0
    with torch.no_grad():
        for graphs in loader:
            _, log_probs = model(graphs)
            guesses = log_probs.argmax(dim=1)
            hits += (guesses == graphs.y).sum().item()

    return hits / len(loader.dataset)

In [41]:
# Load the ENZYMES dataset through PyG's TUDataset class, using ./tmp/ENZYMES as its root directory.
dataset = TUDataset(root='./tmp/ENZYMES', name='ENZYMES')
# Shuffle first: train() splits the dataset 80/20 by position.
dataset = dataset.shuffle()

Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Extracting tmp/ENZYMES/ENZYMES/ENZYMES.zip
Processing...
Done!


In [50]:
# Train the graph-classification model on ENZYMES; train() prints loss and test accuracy every 10 epochs.
model = train(dataset)

[ Epoch 0 ] Loss :1.8258, Test accuracy : 0.2250
[ Epoch 10 ] Loss :1.7815, Test accuracy : 0.2250
[ Epoch 20 ] Loss :1.7558, Test accuracy : 0.1917
[ Epoch 30 ] Loss :1.7432, Test accuracy : 0.2167
[ Epoch 40 ] Loss :1.7480, Test accuracy : 0.1917
[ Epoch 50 ] Loss :1.7444, Test accuracy : 0.2333
[ Epoch 60 ] Loss :1.7563, Test accuracy : 0.1917
[ Epoch 70 ] Loss :1.7417, Test accuracy : 0.2333
[ Epoch 80 ] Loss :1.7571, Test accuracy : 0.2833
[ Epoch 90 ] Loss :1.7417, Test accuracy : 0.2250
[ Epoch 100 ] Loss :1.7264, Test accuracy : 0.2250
[ Epoch 110 ] Loss :1.7220, Test accuracy : 0.2417
[ Epoch 120 ] Loss :1.7281, Test accuracy : 0.3250
[ Epoch 130 ] Loss :1.7298, Test accuracy : 0.2917
[ Epoch 140 ] Loss :1.6961, Test accuracy : 0.2583
[ Epoch 150 ] Loss :1.7126, Test accuracy : 0.3083
[ Epoch 160 ] Loss :1.7180, Test accuracy : 0.3417
[ Epoch 170 ] Loss :1.7229, Test accuracy : 0.2500
[ Epoch 180 ] Loss :1.7032, Test accuracy : 0.2500
[ Epoch 190 ] Loss :1.6979, Test accuracy 