In [None]:
# 0. setting

In [None]:
# variable names

# cora : original dataset
# cora_X : input X (torch_dataset like format)
# cora_X_train : node indices of the train split (50%)
# cora_X_val : node indices of the validation split (25%)
# cora_X_test : node indices of the test split (25%)
# cora_Y : node class label Y
# cora_edge : edge_index (torch_dataset like format)

In [None]:
## Node Classification colab guide

# Install required packages.
import os
import torch
# Export the installed torch version so the shell commands below can expand
# ${TORCH} inside the PyG wheel index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# NOTE(review): none of these installs is version-pinned (the last one builds
# PyG from the git default branch); consider pinning for reproducibility.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

## Hyeongchan's setting

# Mount Google Drive (Colab only); `myroute` is the base folder that later
# cells prefix to dataset paths.
from google.colab import drive
drive.mount('/content/drive')
myroute = '/content/drive/MyDrive/graph'

2.0.0+cu118
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1. data load

In [None]:
## Graph2Gauss's util.py (provides corafull)

import numpy as np
import scipy.sparse as sp

def load_dataset(file_name):
    """Load an attributed graph from a Numpy ``.npz`` archive.

    Parameters
    ----------
    file_name : str
        Name of the file to load. ``'.npz'`` is appended when it is missing.

    Returns
    -------
    graph : dict
        Dictionary that contains:
            * 'A' : The adjacency matrix in sparse (CSR) matrix format
            * 'X' : The attribute matrix in sparse (CSR) matrix format
            * 'z' : The ground truth class labels (``None`` when the archive
              has no 'labels' entry)
            * 'idx_to_node', 'idx_to_attr', 'idx_to_class' : ID mappings,
              each present only when the archive stores a non-empty one
    """
    if not file_name.endswith('.npz'):
        file_name += '.npz'

    # NOTE(security): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code. Only load archives from a trusted source.
    with np.load(file_name, allow_pickle=True) as loader:
        # dict() reads every array into memory, so the archive can be closed
        # before the matrices are assembled.
        arrays = dict(loader)

    A = sp.csr_matrix((arrays['adj_data'], arrays['adj_indices'],
                       arrays['adj_indptr']), shape=arrays['adj_shape'])

    X = sp.csr_matrix((arrays['attr_data'], arrays['attr_indices'],
                       arrays['attr_indptr']), shape=arrays['attr_shape'])

    graph = {
        'A': A,
        'X': X,
        'z': arrays.get('labels')
    }

    for key in ('idx_to_node', 'idx_to_attr', 'idx_to_class'):
        mapping = arrays.get(key)
        if mapping is None:
            continue
        # Convert first, then test emptiness on the plain Python object:
        # `if array:` raises ValueError for arrays with more than one element.
        mapping = mapping.tolist()
        if mapping:
            graph[key] = mapping

    return graph


## Hyeongchan's setting

# Load the dataset archive from the mounted Drive folder.
cora = load_dataset(myroute + '/corafull/cora.npz')

# Sanity-check the sizes of the ID mappings stored in the archive.
print(len(cora['idx_to_node'])) # nodes = 19793 (paper)
print(len(cora['idx_to_attr'])) # features = 8710 (bag of words)
print(len(cora['idx_to_class'])) # class = 70 (category)

19793
8710
70


In [None]:
# 2. model

In [None]:
# 2-1. MLP

In [None]:
## Hyeongchan's setting

# Notebook-level input/output sizes, read by the model classes defined below.
cora_num_features = len(cora['idx_to_attr'])
cora_num_classes = len(cora['idx_to_class'])


## Node Classification colab guide

import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron that classifies nodes from their features only.

    Parameters
    ----------
    hidden_channels : int
        Width of the hidden layer.
    in_channels : int, optional
        Number of input features. Defaults to the notebook-level
        ``cora_num_features``.
    out_channels : int, optional
        Number of output classes. Defaults to the notebook-level
        ``cora_num_classes``.
    dropout : float, optional
        Dropout probability applied after the hidden activation.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None,
                 dropout=0.5):
        super().__init__()
        # Resolve the globals lazily so the class also works for other
        # datasets when explicit sizes are passed.
        if in_channels is None:
            in_channels = cora_num_features
        if out_channels is None:
            out_channels = cora_num_classes
        # Fixed seed so the initial weights are identical on every run.
        # Note this reseeds torch's global RNG as a side effect.
        torch.manual_seed(12345)
        self.dropout = dropout
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return unnormalised class scores (logits) for every row of ``x``."""
        x = self.lin1(x)
        x = x.relu()
        # Dropout is only active in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin2(x)
        return x

In [None]:
## Hyeongchan's setting

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# sparse matrix X -> torch

# Densify through scipy and keep the stored values as float32.
# The previous conversion cast the values to int32 first, which truncates any
# fractional feature value to 0, and built the tensor without an explicit
# size, so its shape was inferred from the largest non-zero index.
# toarray() preserves both the values and the full (num_nodes, num_features)
# shape.
cora_X = torch.from_numpy(cora['X'].toarray().astype(np.float32, copy=False))

# mask index

# Seeded permutation of the node ids so the train/val/test split is the same
# on every run (Restart & Run All reproduces the reported numbers).
SPLIT_SEED = 12345
node_perm = np.random.default_rng(SPLIT_SEED).permutation(len(cora_X))

train_len = int(len(cora_X) * 0.5)
val_len = (len(cora_X) - train_len) // 2
test_len = len(cora_X) - train_len - val_len

print(train_len) # 50%
print(val_len) # 25%
print(test_len) # 25%

# These are arrays of node indices (not boolean masks).
cora_X_train = node_perm[ : train_len]
cora_X_val = node_perm[train_len : train_len + val_len]
cora_X_test = node_perm[train_len + val_len : ]

# The three splits must partition the node set.
assert len(cora_X_train) + len(cora_X_val) + len(cora_X_test) == len(cora_X)

print(cora_X_train) # train indices
print(cora_X_val) # validation indices
print(cora_X_test) # test indices

# label

# CrossEntropyLoss expects int64 class indices.
cora_Y = torch.as_tensor(cora['z'], dtype=torch.long)

9896
4948
4949
[ 2538  6961  4425 ...  2877 14371  8803]
[17972 19701 15026 ...   345 16880  6781]
[12459  4962  6205 ... 13680 14958 18552]


In [None]:
## Node Classification colab guide

from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# NOTE: `model`, `criterion` and `optimizer` are notebook globals read by
# train()/test() below; the GCN section later rebinds the same names.
model = MLP(hidden_channels=16)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Run one full-batch optimisation step on the global ``model``.

    Returns the training loss tensor for this step.
    """
    model.train()
    optimizer.zero_grad()
    # Forward pass over every node; only the training nodes enter the loss.
    logits = model(cora_X)
    train_loss = criterion(logits[cora_X_train], cora_Y[cora_X_train])
    train_loss.backward()
    optimizer.step()
    return train_loss

def test():
    """Evaluate the global ``model`` on the test nodes.

    Returns the fraction of test nodes whose predicted class matches the
    ground-truth label.
    """
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory on the full dense feature matrix.
    with torch.no_grad():
        pred = model(cora_X).argmax(dim=1)  # Class with the highest score.
    test_correct = pred[cora_X_test] == cora_Y[cora_X_test]
    return int(test_correct.sum()) / len(cora_X_test)

for epoch in range(1, 101): # reduced from 200 to 100 epochs
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 4.2562
Epoch: 002, Loss: 4.2492
Epoch: 003, Loss: 4.2424
Epoch: 004, Loss: 4.2352
Epoch: 005, Loss: 4.2281
Epoch: 006, Loss: 4.2200
Epoch: 007, Loss: 4.2113
Epoch: 008, Loss: 4.2029
Epoch: 009, Loss: 4.1938
Epoch: 010, Loss: 4.1851
Epoch: 011, Loss: 4.1751
Epoch: 012, Loss: 4.1655
Epoch: 013, Loss: 4.1548
Epoch: 014, Loss: 4.1452
Epoch: 015, Loss: 4.1354
Epoch: 016, Loss: 4.1253
Epoch: 017, Loss: 4.1167
Epoch: 018, Loss: 4.1053
Epoch: 019, Loss: 4.0975
Epoch: 020, Loss: 4.0896
Epoch: 021, Loss: 4.0812
Epoch: 022, Loss: 4.0757
Epoch: 023, Loss: 4.0656
Epoch: 024, Loss: 4.0579
Epoch: 025, Loss: 4.0569
Epoch: 026, Loss: 4.0486
Epoch: 027, Loss: 4.0469
Epoch: 028, Loss: 4.0459
Epoch: 029, Loss: 4.0417
Epoch: 030, Loss: 4.0394
Epoch: 031, Loss: 4.0379
Epoch: 032, Loss: 4.0372
Epoch: 033, Loss: 4.0365
Epoch: 034, Loss: 4.0358
Epoch: 035, Loss: 4.0365
Epoch: 036, Loss: 4.0330
Epoch: 037, Loss: 4.0357
Epoch: 038, Loss: 4.0318
Epoch: 039, Loss: 4.0343
Epoch: 040, Loss: 4.0324


In [None]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}') # about 0.05 - is this expected? -> is it failing because there are 70 classes?

Test Accuracy: 0.0525


In [None]:
# 2-2. Simple GCN

In [None]:
## Node Classification colab guide

from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Parameters
    ----------
    hidden_channels : int
        Width of the hidden layer.
    in_channels : int, optional
        Number of input features. Defaults to the notebook-level
        ``cora_num_features``.
    out_channels : int, optional
        Number of output classes. Defaults to the notebook-level
        ``cora_num_classes``.
    dropout : float, optional
        Dropout probability applied after the hidden activation.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None,
                 dropout=0.5):
        super().__init__()
        # Resolve the globals lazily so the class also works for other
        # datasets when explicit sizes are passed.
        if in_channels is None:
            in_channels = cora_num_features
        if out_channels is None:
            out_channels = cora_num_classes
        # Fixed seed so the initial weights are identical on every run.
        # Note this reseeds torch's global RNG as a side effect.
        torch.manual_seed(1234567)
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return unnormalised class scores (logits) for every node.

        ``x`` holds the node features and ``edge_index`` the graph edges
        passed to both convolution layers.
        """
        x = self.conv1(x, edge_index)
        x = x.relu()
        # Dropout is only active in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

In [None]:
## Hyeongchan's setting

# adjacency matrix -> edge index of shape (2, num_edges)
# Stack the (row, col) arrays in numpy first: torch.tensor() on a tuple of
# ndarrays takes the slow list-of-arrays path. Cast to int64 because PyG
# documents edge_index as a torch.long tensor (scipy returns int32 here).
cora_edge = torch.from_numpy(np.vstack(cora['A'].nonzero())).long().contiguous()

cora_edge

tensor([[    0,     0,     0,  ..., 19791, 19791, 19791],
        [ 1227,  4021,  4105,  ...,  2099,  5100, 10850]], dtype=torch.int32)

In [None]:
## Node Classification colab guide

from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# NOTE: this rebinds the notebook globals `model`, `optimizer` and `criterion`
# that the MLP section used; train()/test() below read the new objects.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one full-batch optimisation step on the global ``model``.

    Returns the training loss tensor for this step.
    """
    model.train()
    optimizer.zero_grad()
    # Forward pass over the whole graph; only the training nodes enter the loss.
    logits = model(cora_X, cora_edge)
    train_loss = criterion(logits[cora_X_train], cora_Y[cora_X_train])
    train_loss.backward()
    optimizer.step()
    return train_loss

def test():
    """Evaluate the global ``model`` on the test nodes.

    Returns the fraction of test nodes whose predicted class matches the
    ground-truth label.
    """
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory on the full-graph forward pass.
    with torch.no_grad():
        pred = model(cora_X, cora_edge).argmax(dim=1)  # Class with the highest score.
    test_correct = pred[cora_X_test] == cora_Y[cora_X_test]
    return int(test_correct.sum()) / len(cora_X_test)


# Train for 100 full-batch epochs, logging the training loss after each one.
for epoch in range(1, 101):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 4.2485
Epoch: 002, Loss: 4.2429
Epoch: 003, Loss: 4.2375
Epoch: 004, Loss: 4.2321
Epoch: 005, Loss: 4.2269
Epoch: 006, Loss: 4.2217
Epoch: 007, Loss: 4.2167
Epoch: 008, Loss: 4.2117
Epoch: 009, Loss: 4.2068
Epoch: 010, Loss: 4.2021
Epoch: 011, Loss: 4.1974
Epoch: 012, Loss: 4.1928
Epoch: 013, Loss: 4.1884
Epoch: 014, Loss: 4.1840
Epoch: 015, Loss: 4.1797
Epoch: 016, Loss: 4.1755
Epoch: 017, Loss: 4.1714
Epoch: 018, Loss: 4.1674
Epoch: 019, Loss: 4.1635
Epoch: 020, Loss: 4.1597
Epoch: 021, Loss: 4.1560
Epoch: 022, Loss: 4.1524
Epoch: 023, Loss: 4.1488
Epoch: 024, Loss: 4.1454
Epoch: 025, Loss: 4.1420
Epoch: 026, Loss: 4.1387
Epoch: 027, Loss: 4.1355
Epoch: 028, Loss: 4.1323
Epoch: 029, Loss: 4.1292
Epoch: 030, Loss: 4.1262
Epoch: 031, Loss: 4.1233
Epoch: 032, Loss: 4.1204
Epoch: 033, Loss: 4.1176
Epoch: 034, Loss: 4.1149
Epoch: 035, Loss: 4.1123
Epoch: 036, Loss: 4.1097
Epoch: 037, Loss: 4.1071
Epoch: 038, Loss: 4.1047
Epoch: 039, Loss: 4.1023
Epoch: 040, Loss: 4.0999


In [None]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}') # comes out exactly the same?... -> I must have done something wrong

Test Accuracy: 0.0525


In [None]:
# 2-3. GraphSAGE, GAT, GIN...?

In [None]:
# 99. others

In [None]:
# split the data directly, without index masks

# temp, cora_X_test = train_test_split(cora_X, test_size = 0.25)
# cora_X_train, cora_X_val = train_test_split(temp, test_size = 1/3)

# print(len(cora_X_train)) # train 50% = 9896
# print(len(cora_X_val)) # val 25% = 4948
# print(len(cora_X_test)) # test 25% = 4949

In [None]:
## GraphSMOTE's data_load.py (probably the regular Cora)

import scipy.sparse as sp
import numpy as np
# import utils

def load_data(path= 'data/cora/', dataset="cora"):#modified from code: pygcn
    """Load a citation network dataset stored in the pygcn text format.

    Reads ``{path}{dataset}.content`` (one row per node: node id, feature
    values, class label) and ``{path}{dataset}.cites`` (one row per edge:
    two node ids). ``path`` is concatenated as-is, so it must end with a
    path separator.

    Parameters
    ----------
    path : str
        Directory prefix of the dataset files.
    dataset : str
        Base name of the ``.content`` / ``.cites`` files.

    Returns
    -------
    adj : torch sparse COO tensor (float32)
        Symmetrised adjacency matrix.
    features : torch.FloatTensor
        Dense, row-normalised feature matrix.
    labels : torch.LongTensor
        Integer class id of every node.
    """
    print('Loading {} dataset...'.format(dataset))

    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    raw_labels = idx_features_labels[:, -1]

    # Fixed ids for the seven Cora classes keep the label encoding stable
    # across runs (iterating a set of strings gives no stable order).
    cora_classes = {'Neural_Networks': 0, 'Reinforcement_Learning': 1, 'Probabilistic_Methods': 2, 'Case_Based': 3, 'Theory': 4, 'Rule_Learning': 5, 'Genetic_Algorithms': 6}
    label_names = set(raw_labels)
    if label_names <= set(cora_classes):
        classes_dict = cora_classes
    else:
        # Other datasets: number the classes in sorted order instead of
        # silently mapping unknown names to None.
        classes_dict = {name: i for i, name in enumerate(sorted(label_names))}

    labels = np.array([classes_dict[name] for name in raw_labels], dtype=np.int64)

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    # Map the raw node ids onto consecutive row positions 0..N-1.
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix (element-wise maximum of adj and adj.T)
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(labels)

    # utils.print_edges_num(adj.todense(), labels)

    adj = sparse_mx_to_torch_sparse_tensor(adj)
    #adj = torch.FloatTensor(np.array(adj.todense()))

    return adj, features, labels

def normalize(mx):
    """Row-normalize sparse matrix

    Each row is divided by its sum; rows whose sum is zero are left all-zero.
    Integer matrices are supported (the result is then floating point).
    """
    rowsum = np.asarray(mx.sum(1)).flatten()
    # np.power(int_array, -1) raises ValueError, so promote integer sums.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Invert only the non-zero sums: this avoids the divide-by-zero warning
    # and leaves a scale factor of 0 for empty rows.
    r_inv = np.zeros_like(rowsum)
    nonzero = rowsum != 0
    r_inv[nonzero] = np.power(rowsum[nonzero], -1)
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor.

    The result is a float32 sparse COO tensor with the same shape as
    ``sparse_mx``.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # sparse_coo_tensor is the supported equivalent. The explicit size keeps
    # trailing all-zero rows/columns.
    return torch.sparse_coo_tensor(indices, values, shape)

In [None]:
# Smoke-test the loader with its defaults (reads data/cora/cora.content and data/cora/cora.cites).
load_data()

Loading cora dataset...


(tensor(indices=tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
                        [   8,   14,  258,  ...,  774, 1389, 2344]]),
        values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
        size=(2708, 2708), nnz=10556, layout=torch.sparse_coo),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([0, 5, 1,  ..., 6, 3, 0]))