In [53]:
import os.path as osp
import pickle
from collections import namedtuple
import itertools

import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F

In [54]:
# Immutable record bundling everything CoraData produces for the Cora graph.
Data = namedtuple(
    typename='Data',
    field_names=[
        'x',           # node feature matrix
        'y',           # node labels
        'adjacency',   # adjacency matrix of the graph
        'train_mask',  # boolean mask selecting the training nodes
        'val_mask',    # boolean mask selecting the validation nodes
        'test_mask',   # boolean mask selecting the test nodes
    ],
)

In [55]:
class CoraData(object):
    """Loader for the Cora citation dataset stored in the Planetoid raw format.

    The raw files are processed once and the result is cached as a pickle next
    to them; later instantiations reuse the cache unless ``rebuild`` is set.
    """

    def __init__(self, data_root="../dataset/Cora", rebuild=False):
        """
        Process the raw Cora files under ``data_root`` and expose the result
        through the ``data`` property as a ``Data`` named tuple containing:
            x: features of all nodes, shape (2708, 1433)
            y: labels of all nodes, shape (2708, )
            adjacency: adjacency matrix of all nodes, shape (2708, 2708),
                stored as a sparse matrix
            train_mask: training-set mask, shape (2708, ); True for training
                nodes, False otherwise (140 nodes)
            val_mask: validation-set mask, shape (2708, ); True for validation
                nodes, False otherwise (500 nodes)
            test_mask: test-set mask, shape (2708, ); True for test nodes,
                False otherwise (1000 nodes)
        :param data_root: dataset root directory; raw files are read from
            {data_root}/raw and the processed data is cached in
            {data_root}/processed_cora.pkl
        :param rebuild: when True, ignore an existing cache file, reprocess
            the raw data and overwrite the cache
        """
        self.data_root = data_root
        self.filenames = [
            "ind.cora.{}".format(name) for name in
            ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
        ]
        save_file = osp.join(self.data_root, "processed_cora.pkl")
        if osp.exists(save_file) and not rebuild:
            print("Using Cached file: {}".format(save_file))
            # NOTE: unpickling can execute arbitrary code; only point
            # data_root at cache files you created yourself.
            with open(save_file, "rb") as f:
                self._data = pickle.load(f)
        else:
            self._data = self.process_data()
            with open(save_file, "wb") as f:
                pickle.dump(self.data, f)
            print("Cached file: {}".format(save_file))

    @property
    def data(self):
        """Return the processed dataset: x, y, adjacency, train_mask, val_mask, test_mask."""
        return self._data

    def process_data(self):
        """
        Process the raw files into node features, labels, the adjacency matrix
        and the train / validation / test masks.
        Based on https://github.com/FighterLYL/GraphNeuralNetwork
        Adapted from https://github.com/rusty1s/pytorch_geometric

        :return: a ``Data`` named tuple
        """
        print("Process data ...")
        # The first file (training features "x") is not needed separately:
        # the full feature matrix is assembled from allx and tx below.
        _, tx, allx, y, ty, ally, graph, test_index = [
            self.read_data(osp.join(self.data_root, "raw", name))
            for name in self.filenames
        ]
        # Training nodes are the first y.shape[0] rows; the 500 rows that
        # follow them are used for validation.
        train_index = np.arange(y.shape[0])
        val_index = np.arange(y.shape[0], y.shape[0] + 500)
        sorted_test_index = sorted(test_index)

        x = np.concatenate((allx, tx), axis=0)
        # Labels are stored one-hot; argmax turns them into class ids.
        y = np.concatenate((ally, ty), axis=0).argmax(axis=1)

        # Move the test rows to the node ids listed (unsorted) in test.index.
        x[test_index] = x[sorted_test_index]
        y[test_index] = y[sorted_test_index]
        num_nodes = x.shape[0]

        # The ``np.bool`` alias was removed in NumPy 1.24; the builtin
        # ``bool`` yields the same dtype on every NumPy version.
        train_mask = np.zeros(num_nodes, dtype=bool)
        val_mask = np.zeros(num_nodes, dtype=bool)
        test_mask = np.zeros(num_nodes, dtype=bool)
        train_mask[train_index] = True
        val_mask[val_index] = True
        test_mask[test_index] = True
        adjacency = self.build_adjacency(graph)
        print("Node's feature shape: ", x.shape)
        print("Node's label shape: ", y.shape)
        print("Adjacency's shape: ", adjacency.shape)
        print("Number of training nodes: ", train_mask.sum())
        print("Number of validation nodes: ", val_mask.sum())
        print("Number of test nodes: ", test_mask.sum())

        return Data(x=x,
                    y=y,
                    adjacency=adjacency,
                    train_mask=train_mask,
                    val_mask=val_mask,
                    test_mask=test_mask)

    @staticmethod
    def build_adjacency(adj_dict):
        """
        Build a symmetric adjacency matrix from an adjacency list.

        :param adj_dict: mapping of node id -> iterable of neighbour node ids
        :return: scipy COO matrix of shape (len(adj_dict), len(adj_dict)) and
            dtype float32 holding 1.0 for every edge, in both directions
        """
        num_nodes = len(adj_dict)
        # Collect both directions of every edge; the set removes duplicates,
        # which a COO matrix would otherwise sum into values larger than 1.
        edges = set()
        for src, dst in adj_dict.items():
            for v in dst:
                edges.add((src, v))
                edges.add((v, src))
        # reshape keeps the array two-dimensional even when there are no edges.
        edge_index = np.asarray(sorted(edges), dtype="int64").reshape(-1, 2)
        adjacency = sp.coo_matrix(
            (np.ones(len(edge_index)), (edge_index[:, 0], edge_index[:, 1])),
            shape=(num_nodes, num_nodes),
            dtype="float32")
        return adjacency

    @staticmethod
    def read_data(path):
        """
        Read one raw Cora data file.

        :param path: path of the raw file
        :return: an int64 array for the plain-text ``*test.index`` file;
            otherwise the unpickled object, converted to a dense array when it
            provides ``toarray``
        """
        name = osp.basename(path)
        # The index file is plain text (one node id per line), not a pickle.
        # Match on the suffix: the names generated in __init__ are lower-case
        # ("ind.cora.test.index").
        if name.endswith("test.index"):
            out = np.genfromtxt(path, dtype="int64")
            return out
        else:
            # NOTE: unpickling can execute arbitrary code; only read raw
            # files obtained from a trusted source.
            with open(path, "rb") as f:
                out = pickle.load(f, encoding="latin1")
            out = out.toarray() if hasattr(out, "toarray") else out
            return out

    @staticmethod
    def normalization(adjacency):
        """
        Compute L = D^-0.5 * (A + I) * D^-0.5.

        :param adjacency: scipy sparse adjacency matrix A (left unmodified)
        :return: the normalized matrix in COO format
        """
        # Add self-loops into a new matrix instead of using ``+=`` on the argument.
        adjacency = adjacency + sp.eye(adjacency.shape[0])
        degree = np.array(adjacency.sum(1))
        d_hat = sp.diags(np.power(degree, -0.5).flatten())
        return d_hat.dot(adjacency).dot(d_hat).tocoo()


In [58]:
# Load the processed Cora dataset; with rebuild=False an existing pickle cache is reused.
ds = CoraData(data_root="../../GCN/Cora", rebuild=False).data

Using Cached file: ../../GCN/Cora/processed_cora.pkl


读取Cora数据集得到的向量shape

In [52]:
# Inspect the loaded dataset: print every field together with its shape.
print(ds)
print('ds.x:',ds.x,ds.x.shape)  # (2708,1433)
print('ds.y:',ds.y,ds.y.shape)  # (2708,)
print('ds.adjacency:',ds.adjacency,ds.adjacency.shape)  # (2708,2708)
print('ds.train_mask:',ds.train_mask,ds.train_mask.shape)   # (2708,)
print('ds.val_mask:',ds.val_mask,ds.val_mask.shape)# (2708,)
print('ds.test_mask:',ds.test_mask,ds.test_mask.shape)# (2708,)
# Count the True entries of each mask with the vectorized ndarray.sum()
# (the same call process_data uses) rather than the builtin sum(), which
# iterates over the array element by element in Python.
print('训练集掩码向量个数：',ds.train_mask.sum())
print('验证集掩码向量个数：',ds.val_mask.sum())
print('测试集掩码向量个数：',ds.test_mask.sum())

Data(x=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), y=array([3, 4, 4, ..., 3, 3, 3]), adjacency=<2708x2708 sparse matrix of type '<class 'numpy.float32'>'
	with 10556 stored elements in COOrdinate format>, train_mask=array([ True,  True,  True, ..., False, False, False]), val_mask=array([False, False, False, ..., False, False, False]), test_mask=array([False, False, False, ...,  True,  True,  True]))
ds.x: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] (2708, 1433)
ds.y: [3 4 4 ... 3 3 3] (2708,)
ds.adjacency:   (0, 633)	1.0
  (0, 1862)	1.0
  (0, 2582)	1.0
  (1, 2)	1.0
  (1, 652)	1.0
  (1, 654)	1.0
  (2, 1)	1.0
  (2, 332)	1.0
  (2, 1454)	1.0
  (2, 1666)	1.0
  (2, 1986)	1

读取Cora数据集得到样本

1. x: 所有节点的特征，shape为(2708, 1433)
2. y: 所有节点的label，shape为(2708, )
3. adjacency: 所有节点的邻接矩阵，shape为(2708, 2708)，这里采用稀疏矩阵存储
4. train_mask: 训练集掩码向量，shape为(2708, )属于训练集的位置值为True，否则False，共140个
5. val_mask: 验证集掩码向量，shape为(2708, )属于验证集的位置值为True，否则False，共500个
6. test_mask: 测试集掩码向量，shape为(2708, )属于测试集的位置值为True，否则False，共1000个

In [18]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device("cpu")

In [19]:
# Load the processed Cora dataset; with rebuild=False an existing pickle cache is reused.
dataset = CoraData(data_root="../../GCN/Cora", rebuild=False).data

Using Cached file: ../../GCN/Cora/processed_cora.pkl


In [24]:
# Row-normalize the features so that each node's feature vector sums to 1.
# dataset.x is a NumPy array, so the reduction axis is chosen with `axis`
# (`dim` is the torch.Tensor keyword and raises TypeError on an ndarray):
# axis=0 sums down the columns, axis=1 sums across each row, and
# keepdims=True keeps the reduced axis so the division broadcasts per row.
feature_row_sum = dataset.x.sum(axis=1, keepdims=True)
feature_row_sum[feature_row_sum == 0] = 1  # all-zero rows stay zero instead of becoming NaN
node_feature = dataset.x / feature_row_sum
print(node_feature.shape)
print(node_feature)

(2708, 1433)
(2708, 1)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [27]:
# Wrap the normalized features and the labels as tensors on the target device.
tensor_x, tensor_y = (
    torch.from_numpy(array).to(DEVICE)
    for array in (node_feature, dataset.y)
)
print(tensor_x.shape)   # (2708,1433)
print(tensor_y.shape)   # (2708)
print(tensor_y)

torch.Size([2708, 1433])
torch.Size([2708])
tensor([3, 4, 4,  ..., 3, 3, 3], device='cuda:0')


In [29]:
# Move the three boolean split masks onto the target device as tensors.
tensor_train_mask, tensor_val_mask, tensor_test_mask = (
    torch.from_numpy(mask).to(DEVICE)
    for mask in (dataset.train_mask, dataset.val_mask, dataset.test_mask)
)
print(tensor_train_mask)  # (2708)
print(tensor_val_mask)# (2708)
print(tensor_test_mask)# (2708)

tensor([ True,  True,  True,  ..., False, False, False], device='cuda:0')
tensor([False, False, False,  ..., False, False, False], device='cuda:0')
tensor([False, False, False,  ...,  True,  True,  True], device='cuda:0')


In [41]:
# Normalize the adjacency matrix (normalization() adds the self-loops itself).
normalize_adjacency = CoraData.normalization(dataset.adjacency)
# Show the container type (a coordinate-format sparse matrix), then its stored entries.
print(type(normalize_adjacency), normalize_adjacency, sep="\n")

<class 'scipy.sparse._coo.coo_matrix'>
  (0, 0)	0.25
  (0, 633)	0.25
  (0, 1862)	0.22360679774997896
  (0, 2582)	0.25
  (1, 1)	0.25
  (1, 2)	0.2041241452319315
  (1, 652)	0.28867513459481287
  (1, 654)	0.3535533905932738
  (2, 1)	0.2041241452319315
  (2, 2)	0.16666666666666666
  (2, 332)	0.16666666666666666
  (2, 1454)	0.2886751345948129
  (2, 1666)	0.1543033499620919
  (2, 1986)	0.050251890762960605
  (3, 3)	0.5000000000000001
  (3, 2544)	0.5000000000000001
  (4, 4)	0.16666666666666666
  (4, 1016)	0.16666666666666666
  (4, 1256)	0.13608276348795434
  (4, 1761)	0.14433756729740646
  (4, 2175)	0.16666666666666666
  (4, 2176)	0.13608276348795434
  (5, 5)	0.25
  (5, 1629)	0.25
  (5, 1659)	0.28867513459481287
  :	:
  (2699, 2699)	0.5000000000000001
  (2700, 1151)	0.408248290463863
  (2700, 2700)	0.5000000000000001
  (2701, 44)	0.28867513459481287
  (2701, 2624)	0.3333333333333333
  (2701, 2701)	0.3333333333333333
  (2702, 186)	0.2182178902359924
  (2702, 1536)	0.2581988897471611
  (2702, 2

scipy.sparse._coo.coo_matrix表示坐标格式的矩阵
```python
from scipy.sparse import coo_matrix
import numpy as np
row = np.array([1, 1, 3, 2]) # 行索引
col = np.array([0, 2, 2, 3]) # 列索引
data= np.array([5, 8, 4, 9]) # 索引对应的数值
coo = coo_matrix((data, (row, col)), shape=(4, 4)).todense()
#先看shape，表示这个稀疏矩阵是4x4大小的，所有值初始都为0，即4x4的全0矩阵
#(row, col)行、列组合就表示一个具体的位置，其(1,0),(1,2),(3,2),(2,3)就是4x4矩阵的索引位置。
#data,表示索引位置上的数值，即(1,0)上的数值为5，(1,2)上的数值为8，等等。
#todense,作用可以自己试试，如果没有这个函数，则输出如下结果
#  (1, 0)	5
#  (1, 2)	8
#  (3, 2)	4
#  (2, 3)	9
print(coo)
#打印出coo稀疏矩阵
```

In [45]:
# Number of nodes and the per-node feature dimension, read off the feature matrix.
num_nodes, input_dim = node_feature.shape
print(num_nodes, input_dim)
# Lengths of the COO row-index and column-index arrays, one per output line.
print(len(normalize_adjacency.row), len(normalize_adjacency.col), sep="\n")

2708 1433
2707
2707


In [36]:
# Convert the normalized SciPy COO adjacency into a PyTorch sparse COO tensor.
# Stack the row and column index arrays into a single (2, nnz) int64 array;
# astype('int64') already yields the integer type torch expects for indices.
indices = torch.from_numpy(
    np.asarray([normalize_adjacency.row,
                normalize_adjacency.col]).astype('int64'))
print(indices)    # (2, nnz); the recorded run stored nnz = 13264 entries
values = torch.from_numpy(normalize_adjacency.data.astype(np.float32))
print(values) # (nnz,)
# torch.sparse_coo_tensor is the supported replacement for the deprecated
# legacy constructor torch.sparse.FloatTensor; the float32 values keep the
# resulting tensor's dtype the same.
tensor_adjacency = torch.sparse_coo_tensor(indices, values,
                                           (num_nodes, num_nodes)).to(DEVICE)
print(tensor_adjacency)# dense size (num_nodes, num_nodes), i.e. (2708, 2708) in the recorded run

tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [   0,  633, 1862,  ..., 1473, 2706, 2707]])
tensor([0.2500, 0.2500, 0.2236,  ..., 0.2000, 0.2000, 0.2000])
tensor(indices=tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
                       [   0,  633, 1862,  ..., 1473, 2706, 2707]]),
       values=tensor([0.2500, 0.2500, 0.2236,  ..., 0.2000, 0.2000, 0.2000]),
       device='cuda:0', size=(2708, 2708), nnz=13264, layout=torch.sparse_coo)
