In [1]:
import os.path
import pickle
from collections import namedtuple
import itertools
import numpy as np
import scipy.sparse as sp



In [2]:
# Lightweight record type for one processed graph dataset: node features,
# node labels, the adjacency matrix and the three boolean split masks.
# The dataset files were downloaded beforehand; PyG (PyTorch Geometric) is
# not imported in this notebook.
Data = namedtuple(
    "Data",
    (
        'x',
        'y',
        'adjacency',
        'train_mask',
        'val_mask',
        'test_mask',
    ),
)

In [51]:
class CoraData(object):
    """Loader for the Cora dataset stored as ``ind.cora.*`` files.

    On construction the processed dataset is either read from a pickle cache
    (``<data_root>/processed_cora.pkl``) or rebuilt from the raw files under
    ``<data_root>/raw`` and then written to that cache. The processed dataset
    is a ``Data`` namedtuple, retrieved with :meth:`data`.
    """

    def __init__(self,data_root='../dataset/cora',rebuild=False):
        """Load the processed dataset, building and caching it when needed.

        Args:
            data_root: Directory holding the ``raw`` sub-directory and the
                ``processed_cora.pkl`` cache file.
            rebuild: When True, ignore any existing cache and re-process the
                raw files (the cache is then overwritten).
        """
        self.data_root = data_root
        self.filenames = [
            "ind.cora.{}".format(name) for name in [
                'x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index'
            ]
        ]
        # File contents (shapes are the original author's notes, not checked here):
        # ind.cora.x          => feature vectors of the training instances,
        #                        a scipy sparse CSR matrix, shape (140, 1433)
        # ind.cora.tx         => feature vectors of the test instances, shape (1000, 1433)
        # ind.cora.allx       => feature vectors of labelled + unlabelled training
        #                        instances (superset of x), shape (1708, 1433)
        # ind.cora.y          => one-hot labels of the training instances,
        #                        numpy.ndarray, shape (140, 7)
        # ind.cora.ty         => one-hot labels of the test instances, shape (1000, 7)
        # ind.cora.ally       => one-hot labels matching allx, shape (1708, 7)
        # ind.cora.graph      => graph as a collections.defaultdict of the form
        #                        {index: [index_of_neighbor_nodes]}
        # ind.cora.test.index => ids of the test instances (plain text, one per line)
        # All files except test.index must be stored with Python's pickle module.

        save_file = os.path.join(self.data_root, "processed_cora.pkl")
        if os.path.exists(save_file) and not rebuild:
            print("Using Cached file:{}".format(save_file))
            # SECURITY: pickle.load can execute arbitrary code; only point
            # data_root at directories whose contents you trust.
            with open(save_file, "rb") as f:
                self._data = pickle.load(f)
        else:
            self._data = self.process_data()
            with open(save_file, "wb") as f:
                # Persist the dataset itself (not the bound ``data`` method).
                pickle.dump(self._data, f)
            print("Cached file:{}".format(save_file))

    def data(self):
        """Return the processed dataset (the ``Data`` namedtuple)."""
        return self._data

    def process_data(self):
        """Read the raw files and assemble features, labels, adjacency and masks.

        Returns:
            A ``Data`` namedtuple with fields ``x``, ``y`` (integer class ids
            obtained by argmax over the one-hot labels), ``adjacency``,
            ``train_mask``, ``val_mask`` and ``test_mask``.
        """
        print("Processing data ...")
        # The first file (ind.cora.x) is read for completeness, but its rows are
        # not used: the feature matrix is rebuilt from allx + tx below.
        _, tx, allx, y, ty, ally, graph, test_index = [
            self.read_data(os.path.join(self.data_root, "raw", name))
            for name in self.filenames
        ]
        num_train = y.shape[0]
        train_index = np.arange(num_train)
        # The 500 nodes right after the training nodes form the validation set.
        val_index = np.arange(num_train, num_train + 500)
        sorted_test_index = sorted(test_index)

        x = np.concatenate((allx, tx), axis=0)
        y = np.concatenate((ally, ty), axis=0).argmax(axis=1)
        # The test rows are stored in the order given by test.index; move each
        # one to the row of its actual node id. The fancy-indexed right-hand
        # side is a copy, so the assignment cannot overwrite its own source.
        x[test_index] = x[sorted_test_index]
        y[test_index] = y[sorted_test_index]
        num_nodes = x.shape[0]

        # Builtin ``bool`` is used because the ``np.bool`` alias was removed
        # from NumPy.
        train_mask = np.zeros(num_nodes, dtype=bool)
        val_mask = np.zeros(num_nodes, dtype=bool)
        test_mask = np.zeros(num_nodes, dtype=bool)
        train_mask[train_index] = True
        val_mask[val_index] = True
        test_mask[test_index] = True

        adjacency = self.build_adjacency(graph)
        print("Node's feature shape: ", x.shape)
        print("Node's label shape: ", y.shape)
        print("Adjacency's shape: ", adjacency.shape)
        print("Number of training nodes: ", train_mask.sum())
        print("Number of validation nodes: ", val_mask.sum())
        print("Number of test nodes: ", test_mask.sum())

        return Data(x=x, y=y, adjacency=adjacency,
                    train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

    def build_adjacency(self,adj_dict):
        """Build a symmetric sparse adjacency matrix from an adjacency list.

        Args:
            adj_dict: Mapping ``{node: iterable_of_neighbour_nodes}``. The
                number of nodes is taken to be ``len(adj_dict)``.

        Returns:
            A ``scipy.sparse.coo_matrix`` of dtype float32 and shape
            ``(num_nodes, num_nodes)`` with a 1 for every (deduplicated) edge
            in both directions.
        """
        num_nodes = len(adj_dict)
        # A set drops repeated edges; otherwise COO would sum duplicates into
        # entries greater than 1.
        edges = set()
        for src, neighbours in adj_dict.items():
            for dst in neighbours:
                edges.add((src, dst))
                edges.add((dst, src))
        # reshape keeps the array two-dimensional even when there are no edges.
        edge_index = np.asarray(sorted(edges), dtype=np.int64).reshape(-1, 2)
        # Stored and processed as a sparse matrix: (values, (rows, cols)).
        adjacency = sp.coo_matrix(
            (np.ones(len(edge_index)), (edge_index[:, 0], edge_index[:, 1])),
            shape=(num_nodes, num_nodes),
            dtype="float32",
        )
        return adjacency

    @staticmethod
    def read_data(path):
        """Read one raw dataset file.

        ``ind.cora.test.index`` is parsed as a text file of integers; every
        other file is unpickled, and objects exposing ``toarray`` (sparse
        matrices) are converted to dense arrays.

        Args:
            path: Path of the file to read.

        Returns:
            The loaded object (an int64 array for the index file).
        """
        name = os.path.basename(path)
        if name == "ind.cora.test.index":
            out = np.genfromtxt(path, dtype="int64")
        else:
            # SECURITY: pickle.load can execute arbitrary code; only load raw
            # files obtained from a trusted source.
            with open(path, "rb") as f:
                out = pickle.load(f, encoding="latin1")
            out = out.toarray() if hasattr(out, "toarray") else out
        return out

    @staticmethod
    def normalization(adjacency):
        """Return the symmetrically normalised adjacency with self-loops.

        Computes ``D^-1/2 (A + I) D^-1/2`` where ``D`` is the diagonal degree
        matrix of ``A + I``.

        Args:
            adjacency: Square scipy sparse adjacency matrix ``A``.

        Returns:
            The normalised matrix as a ``scipy.sparse.coo_matrix``.
        """
        # Add self-connections; a new matrix is built so the caller's matrix
        # is left untouched.
        adjacency = adjacency + sp.eye(adjacency.shape[0])
        degree = np.array(adjacency.sum(axis=1))
        d_hat = sp.diags(np.power(degree, -0.5).flatten())
        return d_hat.dot(adjacency).dot(d_hat).tocoo()

In [17]:
# Sanity check: stacking two one-hot label blocks and taking the argmax along
# axis 1 yields the integer class id of every row.
y1 = np.array([
    [1, 0, 0, 0],
    [0, 0, 1, 0],
])
y2 = np.array([
    [0, 1, 0, 0],
    [0, 0, 0, 1],
])
y = np.argmax(np.concatenate((y1, y2), axis=0), axis=1)
print(y)

[0 2 1 3]
