In [None]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View the mounted dataset directory.
# Changes under this directory are reverted automatically when the environment is restarted.
!ls /home/aistudio/data

data117595  data61620


In [None]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. 
# All changes under this directory will be kept even after reset. 
# Please clean unnecessary files in time to speed up environment loading. 
!ls /home/aistudio/work

edges.csv  feat.npy  graph  test.csv  train.csv  work


In [None]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If installed packages must persist across restarts,
# install them into a persistent path, as in the following example:
#!mkdir /home/aistudio/external-libraries
#!pip install pgl==1.2.0 easydict -q -t /home/aistudio/external-libraries

^C


In [2]:
!pip install pgl==1.2.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pgl==1.2.0
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/35/fa/2290e78914d34d4e4480d7982b8f4d0c58a7e53535113a668a9d75d5c3b6/pgl-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 14.1MB/s eta 0:00:01
[?25hCollecting redis-py-cluster (from pgl==1.2.0)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/b2/96/153bbcf5dee29b52b2674e77a87ce864d381f72151737317529b7de4f337/redis_py_cluster-2.1.3-py2.py3-none-any.whl (42kB)
[K     |████████████████████████████████| 51kB 14.6MB/s eta 0:00:01
Collecting redis<4.0.0,>=3.0.0 (from redis-py-cluster->pgl==1.2.0)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a7/7c/24fb0511df653cf1a5d938d8f5d19802a88cef255706fdda242ff97e91b7/redis-3.5.3-py2.py3-none-any.whl (72kB)
[K     |████████████████████████████████| 81kB 3.3MB/s eta 0:00:011
Installing collected packages: redis, redis-py-cluster, pg

In [None]:
#!/usr/bin/env python
# coding: utf-8

# ## 代码整体逻辑
# 
# 1. 读取提供的数据集，包含构图以及读取节点特征（用户可自己改动边的构造方式）
# 
# 2. 配置化生成模型，用户也可以根据教程进行图神经网络的实现。
# 
# 3. 开始训练
# 
# 4. 执行预测并产生结果文件
# 

# ## 环境配置
# 
# 该项目依赖飞桨paddlepaddle==1.8.4, 以及pgl==1.2.0。请按照版本号下载对应版本就可运行。


import pgl
import paddle.fluid as fluid
import numpy as np
import time
import pandas as pd



from easydict import EasyDict as edict

# Hyper-parameters handed to build_model() further down via `config=config`.
# Wrapped in an EasyDict so the entries can also be read as attributes.
#
# Alternative GCN setup kept for reference:
#     model_name="GCN", num_layers=2, dropout=0.2, hidden_size=512,
#     learning_rate=0.001, weight_decay=0.00001, edge_dropout=0.00
config = edict(
    {
        "model_name": "GATGCN",
        "num_layers": 2,
        "dropout": 0.0,
        "hidden_size": 64,
        "learning_rate": 0.005,
        "weight_decay": 1e-8,
        "edge_dropout": 0.00,
    }
)


# ## 数据加载模块
# 
# 这里主要是用于读取数据集，包括读取图数据构图，以及训练集的划分。

# In[16]:


from collections import namedtuple

# Immutable bundle produced by load(): the graph plus the train / validation /
# test node splits. Field order is part of the positional interface.
Dataset = namedtuple(
    "Dataset",
    [
        "graph",
        "num_classes",
        "train_index",
        "train_label",
        "valid_index",
        "valid_label",
        "test_index",
    ],
)

def load_edges(num_nodes, self_loop=True, add_inverse_edge=True,
               edges_path="work/edges.csv"):
    """Read the edge list and optionally add reverse edges and self-loops.

    Args:
        num_nodes: Number of nodes; when ``self_loop`` is true, one ``(i, i)``
            edge is appended for every ``i`` in ``[0, num_nodes)``.
        self_loop: Append a self-loop for every node.
        add_inverse_edge: Append the reversed ``(dst, src)`` copy of every
            edge read from the file.
        edges_path: Header-less two-column CSV (``src``, ``dst``). Defaults to
            the path this notebook has always read from.

    Returns:
        A 2-D array with one ``[src, dst]`` row per edge, ordered as: file
        edges, then reversed edges, then self-loops.
    """
    edges = pd.read_csv(edges_path, header=None, names=["src", "dst"]).values

    if add_inverse_edge:
        # Swapping the two columns yields the reversed direction of each edge.
        edges = np.vstack([edges, edges[:, ::-1]])

    if self_loop:
        # Use a separate local so the boolean flag is not overwritten by an array.
        node_ids = np.arange(num_nodes)
        loop_edges = np.stack([node_ids, node_ids], axis=1)
        edges = np.vstack([edges, loop_edges])

    return edges

def load(train_ratio=0.95, num_classes=35):
    """Load node features, edges and the labelled/unlabelled node splits.

    Reads ``work/feat.npy`` (node features), the edge list via ``load_edges``
    (with self-loops and reverse edges), ``work/train.csv`` (columns ``nid``
    and ``label``) and ``work/test.csv`` (column ``nid``).

    The labelled nodes are split in file order, without shuffling: the first
    ``train_ratio`` fraction becomes the training set and the remainder the
    validation set.

    Args:
        train_ratio: Fraction of the rows of ``train.csv`` used for training.
        num_classes: Number of label classes stored on the returned dataset.

    Returns:
        A ``Dataset`` whose graph carries two node features: ``"feat"`` (the
        loaded features) and ``"norm"`` (``indegree ** -0.5`` with the
        in-degree clamped to at least 1, as a trailing-axis column).
    """
    node_feat = np.load("work/feat.npy")
    num_nodes = node_feat.shape[0]
    edges = load_edges(num_nodes=num_nodes, self_loop=True, add_inverse_edge=True)

    df = pd.read_csv("work/train.csv")
    node_index = df["nid"].values
    node_label = df["label"].values

    # Sequential split: leading rows train, trailing rows validate.
    train_part = int(len(node_index) * train_ratio)
    train_index = node_index[:train_part]
    train_label = node_label[:train_part]
    valid_index = node_index[train_part:]
    valid_label = node_label[train_part:]
    test_index = pd.read_csv("work/test.csv")["nid"].values

    # A one-hot label matrix used to be allocated here but was never read;
    # it was dropped to avoid a num_nodes x num_classes float64 allocation.

    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges, node_feat={"feat": node_feat})

    # Degree normalisation: clamp to 1 so the power is never taken of zero.
    indegree = graph.indegree()
    norm = np.maximum(indegree.astype("float32"), 1)
    norm = np.power(norm, -0.5)
    graph.node_feat["norm"] = np.expand_dims(norm, -1)

    dataset = Dataset(graph=graph,
                    train_label=train_label,
                    train_index=train_index,
                    valid_index=valid_index,
                    valid_label=valid_label,
                    test_index=test_index, num_classes=num_classes)
    return dataset


# In[17]:


# Load everything once, then give each index / label vector a trailing axis
# of size 1 before it is fed to the model.
dataset = load()

train_index = np.expand_dims(dataset.train_index, -1)
train_label = np.reshape(dataset.train_label, [-1, 1])

val_index = np.expand_dims(dataset.valid_index, -1)
val_label = np.reshape(dataset.valid_label, [-1, 1])

test_index = np.expand_dims(dataset.test_index, -1)
# Test labels are unknown; zeros serve as placeholders for the label feed.
test_label = np.zeros((len(test_index), 1), dtype="int64")


# ## 组网模块
# 
# 这里是组网模块，目前已经提供了一些预定义的模型，包括**GCN**, **GAT**, **APPNP**等。可以通过简单的配置，设定模型的层数，hidden_size等。你也可以深入到model.py里面，去奇思妙想，写自己的图神经网络。

# In[18]:


import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model

# Run on CPU (uncomment to use instead of the GPU below)
#place = fluid.CPUPlace()

# Run on GPU device 0
place = fluid.CUDAPlace(0)

# Training graph: built into Paddle's default main / startup programs.
train_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
with fluid.program_guard(train_program, startup_program):
    with fluid.unique_name.guard():
        # `pred` from the training build is not used in the visible code.
        gw, loss, acc, pred = build_model(dataset,
                            config=config,
                            phase="train",
                            main_prog=train_program)

# Evaluation graph: a separate Program, built against the same startup program.
# NOTE(review): each build runs under its own unique_name guard, presumably so
# both builds generate identical parameter names and share weights - confirm
# against build_model.
test_program = fluid.Program()
with fluid.program_guard(test_program, startup_program):
    with fluid.unique_name.guard():
        # The test build's graph wrapper (`_gw`) is not used in the visible code;
        # the training loop feeds both programs through `gw`.
        _gw, v_loss, v_acc, v_pred = build_model(dataset,
            config=config,
            phase="test",
            main_prog=test_program)


# Replace the evaluation program with its `for_test` clone.
test_program = test_program.clone(for_test=True)

# One executor, bound to the device chosen above, runs both programs.
exe = fluid.Executor(place)


# ## 开始训练过程
# 
# 图神经网络采用FullBatch的训练方式，每一步训练就会把所有整张图训练样本全部训练一遍。
# 

NUM_EPOCHS = 500
# Epochs 0..WARMUP_EPOCHS are trained and validated but do not produce a
# submission, so no test-set inference is spent on the early model.
WARMUP_EPOCHS = 100

exe.run(startup_program)

# Convert the graph into a feed dict for the Paddle Executor.
feed_dict = gw.to_feed(dataset.graph)

# The node / label arrays are identical in every epoch, so build them once.
train_nodes = np.array(train_index, dtype="int64")
train_labels = np.array(train_label, dtype="int64")
val_nodes = np.array(val_index, dtype="int64")
val_labels = np.array(val_label, dtype="int64")
test_nodes = np.array(test_index, dtype="int64")
test_dummy_labels = np.array(test_label, dtype="int64")  # placeholder labels

best_acc = 0  # best validation accuracy seen over all epochs
# Validation accuracy of the model that produced the current submission.csv.
# Tracked separately from best_acc: previously a best score reached during
# warm-up raised best_acc without writing a file, and if it was never beaten
# afterwards no submission was produced at all.
best_submitted_acc = None

for epoch in range(NUM_EPOCHS):
    # Full-batch training step.
    # node_index: nids of the training nodes
    # node_label: labels of those nodes
    feed_dict["node_index"] = train_nodes
    feed_dict["node_label"] = train_labels

    train_loss, train_acc = exe.run(train_program,
                                feed=feed_dict,
                                fetch_list=[loss, acc],
                                return_numpy=True)

    # Full-batch validation.
    # node_index: nids of the validation nodes
    # node_label: labels of those nodes
    feed_dict["node_index"] = val_nodes
    feed_dict["node_label"] = val_labels
    val_loss, val_acc = exe.run(test_program,
                            feed=feed_dict,
                            fetch_list=[v_loss, v_acc],
                            return_numpy=True)
    print("Epoch", epoch, "Train Acc", train_acc[0], "Valid Acc", val_acc[0])

    if val_acc[0] > best_acc:
        best_acc = val_acc[0]

    past_warmup = epoch > WARMUP_EPOCHS
    beats_submission = best_submitted_acc is None or val_acc[0] > best_submitted_acc
    # Fallback so a run that never leaves warm-up still writes a submission.
    last_chance = epoch == NUM_EPOCHS - 1 and best_submitted_acc is None

    if (past_warmup and beats_submission) or last_chance:
        # Predict the test set with the current weights. The test labels are
        # unknown, so placeholder labels are fed; only the predictions are
        # fetched.
        feed_dict["node_index"] = test_nodes
        feed_dict["node_label"] = test_dummy_labels
        test_prediction = exe.run(test_program,
                                    feed=feed_dict,
                                    fetch_list=[v_pred],
                                    return_numpy=True)[0]

        # Write the submission file (overwrites the previous one).
        submission = pd.DataFrame(data={
                                    "nid": test_index.reshape(-1),
                                    "label": test_prediction.reshape(-1)
                                })
        submission.to_csv("submission.csv", index=False)
        best_submitted_acc = val_acc[0]






W1219 15:23:51.065299   119 device_context.cc:252] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W1219 15:23:51.069478   119 device_context.cc:260] device: 0, cuDNN Version: 7.6.


Epoch 0 Train Acc 0.07977759 Valid Acc 0.20330296
Epoch 1 Train Acc 0.19939151 Valid Acc 0.1452164
Epoch 2 Train Acc 0.14774516 Valid Acc 0.2164009
Epoch 3 Train Acc 0.21568274 Valid Acc 0.15888382
Epoch 4 Train Acc 0.15146202 Valid Acc 0.19419134
Epoch 5 Train Acc 0.19276711 Valid Acc 0.2747722
Epoch 6 Train Acc 0.27154055 Valid Acc 0.3129271
Epoch 7 Train Acc 0.31627774 Valid Acc 0.32004556
Epoch 8 Train Acc 0.32583967 Valid Acc 0.37841687
Epoch 9 Train Acc 0.37511802 Valid Acc 0.2844533
Epoch 10 Train Acc 0.2827061 Valid Acc 0.37158313
Epoch 11 Train Acc 0.3612997 Valid Acc 0.34197038
Epoch 12 Train Acc 0.3388037 Valid Acc 0.3638952
Epoch 13 Train Acc 0.3662605 Valid Acc 0.4034738
Epoch 14 Train Acc 0.40512267 Valid Acc 0.43394077


请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions.