In [1]:
import json
import os
from collections import namedtuple
import scipy.sparse
from sklearn.preprocessing import StandardScaler
import dgl
import numpy as np
import torch
from sklearn.metrics import f1_score

# Data Load
From : utils.py > load_data()

In [5]:
# Stand-in for the argument object that load_data() receives in utils.py;
# the only use visible in this notebook is the commented-out `args.dataset`.
args = None
# Selects the multi-label branch when the label array is built further down.
multilabel: bool = True

In [9]:
# The dataset is read from ./data. When only ./graphsaintdata is present it is
# renamed to ./data; when neither directory exists there is nothing to load.
if not os.path.exists('data'):
    if not os.path.exists('graphsaintdata'):
        raise ValueError("The directory graphsaintdata does not exist!")
    os.rename('graphsaintdata', 'data')

# Originally "data/{}".format(args.dataset); pinned to the PPI dataset here.
prefix = "data/ppi"
# Lightweight record bundling the graph with its class count and training node ids.
DataType = namedtuple('Dataset', 'num_classes train_nid g')

In [10]:
# Full adjacency matrix of the dataset, cast to bool so only edge presence is kept.
adj_full = scipy.sparse.load_npz(f'./{prefix}/adj_full.npz').astype(bool)
# Build the DGL graph directly from the sparse matrix and record its node count.
g = dgl.from_scipy(adj_full)
num_nodes = g.num_nodes()

In [19]:
# Show the matrix summary: shape, dtype and number of stored entries.
adj_full

<14755x14755 sparse matrix of type '<class 'numpy.bool_'>'
	with 450540 stored elements in Compressed Sparse Row format>

In [11]:
# Training adjacency matrix, cast to bool like the full adjacency matrix.
adj_train = scipy.sparse.load_npz('./{}/adj_train.npz'.format(prefix)).astype(bool)
# Ids of the rows that hold at least one nonzero entry, i.e. the nodes that
# have at least one edge in adj_train. np.unique de-duplicates the row indices
# in a single vectorized call and returns them in ascending order, so the
# result does not depend on set-iteration order.
train_nid = np.unique(adj_train.nonzero()[0])

In [31]:
# Experiment: what does np.array(list(set(adj_train.nonzero()[0]))) compute?

from scipy.sparse import csr_matrix

# Tiny 3x3 example in which every row holds at least one nonzero value.
A = csr_matrix([
    [1, 2, 0],
    [0, 0, 3],
    [4, 0, 5],
])

# Row index of every nonzero entry (a row appears once per nonzero it holds).
nonzero_rows = A.nonzero()[0]
# Collapsing to a set leaves the rows that have at least one nonzero entry.
set(nonzero_rows)

{0, 1, 2}

# train_nid == IDs of the nodes that have at least one edge in adj_train

In [12]:
# role.json lists the node ids of each split under the keys 'tr', 'va' and 'te'.
# A context manager is used so the file handle is closed deterministically.
with open('./{}/role.json'.format(prefix)) as role_file:
    role = json.load(role_file)

# All-False template with one entry per node; each split mask starts as a copy
# of it and then has the ids of its split switched on.
mask = np.zeros((num_nodes,), dtype=bool)
train_mask = mask.copy()
train_mask[role['tr']] = True
val_mask = mask.copy()
val_mask[role['va']] = True
test_mask = mask.copy()
test_mask[role['te']] = True

In [33]:
# Inspect which split names the role file provides.
role.keys()

dict_keys(['tr', 'va', 'te'])

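# Q: What are 'va' and 'te'? (They hold the node ids used to build val_mask and test_mask above, i.e. the validation and test splits.)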

In [13]:
# Raw node features, one row per node.
feats = np.load(f'./{prefix}/feats.npy')
# Fit the standardization on the rows selected by train_nid only -- presumably
# so that statistics of the other nodes do not influence the scaling -- and
# then apply it to every row.
scaler = StandardScaler().fit(feats[train_nid])
feats = scaler.transform(feats)

In [34]:
# Eyeball the standardized feature matrix (numpy elides the middle of large arrays).
feats

array([[-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815],
       [-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815],
       [-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815],
       ...,
       [-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815],
       [-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815],
       [-0.08760569, -0.08760569, -0.1132336 , ..., -0.13184157,
        -0.14681277, -0.14717815]])

In [14]:
# class_map holds the label(s) of every node (PPI is multi-label).
# A context manager is used so the file handle is closed deterministically.
with open('./{}/class_map.json'.format(prefix)) as class_map_file:
    class_map = json.load(class_map_file)
# JSON object keys are always strings; convert them back to integer node ids.
class_map = {int(node_id): node_labels for node_id, node_labels in class_map.items()}

In [36]:
# Number of nodes that have an entry in class_map.
len(class_map)

14755

In [39]:
# Sum of node 0's label vector; assuming the entries are 0/1 flags, this is
# the number of classes assigned to node 0.
sum(class_map[0])

31

In [38]:
# Confirm which branch the label-array construction in the next cell will take.
multilabel

True

In [15]:
# Work out the number of classes and the shape of the dense label array.
if multilabel:
    # Multi-label binary classification: every node carries one flag per
    # class, so the width of any label vector gives the class count.
    first_label_vector = list(class_map.values())[0]
    num_classes = len(first_label_vector)
    label_shape = (num_nodes, num_classes)
else:
    # Single-label classification: one class id per node; the class count is
    # the span of the ids that occur.
    all_class_ids = class_map.values()
    num_classes = max(all_class_ids) - min(all_class_ids) + 1
    label_shape = (num_nodes,)

# Dense label array indexed by node id; nodes absent from class_map stay zero.
class_arr = np.zeros(label_shape)
for node_id, node_label in class_map.items():
    class_arr[node_id] = node_label

In [43]:
# class_arr maps each node (row index = node id) to its labels; in the
# multi-label case the shape is (num_nodes, num_classes).
class_arr.shape

(14755, 121)

In [16]:
# Attach the per-node tensors to the graph.
# Multi-label targets are stored as floats, single-label targets as integer ids.
label_dtype = torch.float if multilabel else torch.long
g.ndata['feat'] = torch.tensor(feats, dtype=torch.float)
g.ndata['label'] = torch.tensor(class_arr, dtype=label_dtype)

# The three boolean split masks are stored under their own names.
for mask_name, mask_values in (
    ('train_mask', train_mask),
    ('val_mask', val_mask),
    ('test_mask', test_mask),
):
    g.ndata[mask_name] = torch.tensor(mask_values, dtype=torch.bool)

In [17]:
# Bundle everything the training code needs into one record; this is the
# value that load_data() in utils.py returns.
data = DataType(
    num_classes=num_classes,
    train_nid=train_nid,
    g=g,
)
data

Dataset(num_classes=121, train_nid=array([   0,    1,    2, ..., 9713, 9714, 9715]), g=Graph(num_nodes=14755, num_edges=450540,
      ndata_schemes={'feat': Scheme(shape=(50,), dtype=torch.float32), 'label': Scheme(shape=(121,), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={}))

Something looks off with num_edges:
Here the graph is built directly from the full sparse adjacency matrix (rows x cols),
whereas the CORA dataset keeps only the rows/cols that have edges.

i.e.
1. In PPI, there can be nodes without any edge.
2. In CORA, every node has at least one edge.

# Preprocessing
From: train_sampling.py > main()

In [44]:
# Load and preprocess the dataset.
# In train_sampling.py `data` comes from load_data(args, multilabel); in this
# notebook it was assembled cell by cell above.
g = data.g
train_nid = data.train_nid
n_classes = data.num_classes

# Per-node tensors stored on the graph.
labels = g.ndata['label']
train_mask = g.ndata['train_mask']
val_mask = g.ndata['val_mask']
test_mask = g.ndata['test_mask']

# Basic sizes: feature width, node count and edge count.
in_feats = g.ndata['feat'].shape[1]
n_nodes = g.num_nodes()
n_edges = g.num_edges()

# Size of each split = number of True entries in its mask.
n_train_samples, n_val_samples, n_test_samples = (
    split_mask.int().sum().item()
    for split_mask in (train_mask, val_mask, test_mask)
)

stats_template = """----Data statistics------'
#Nodes %d
#Edges %d
#Classes/Labels (multi binary labels) %d
#Train samples %d
#Val samples %d
#Test samples %d"""
print(stats_template % (
    n_nodes, n_edges, n_classes,
    n_train_samples,
    n_val_samples,
    n_test_samples,
))
# load sampler


----Data statistics------'
#Nodes 14755
#Edges 450540
#Classes/Labels (multi binary labels) 121
#Train samples 9716
#Val samples 1825
#Test samples 3214
