In [None]:
!mkdir /home/aistudio/external-libraries
!pip install pgl==1.2.0 easydict -q -t /home/aistudio/external-libraries

[31mERROR: blackhole 0.3.2 has requirement xgboost==1.1.0, but you'll have xgboost 1.3.3 which is incompatible.[0m


In [1]:
import sys

# Directory that the install cell targets with `pip install -t`.
_EXTERNAL_LIB_DIR = '/home/aistudio/external-libraries'

# Append only once so that re-running this cell does not pile up duplicate
# entries on sys.path.
if _EXTERNAL_LIB_DIR not in sys.path:
    sys.path.append(_EXTERNAL_LIB_DIR)


In [2]:
import pgl
import paddle.fluid as fluid
import numpy as np
import time
import pandas as pd

In [3]:
# Author's note: training for 600 epochs gave acc=0.72
from easydict import EasyDict as edict

# Settings handed to build_model; wrapped in an EasyDict so the entries can
# also be read as attributes (config.dropout, ...).
config = edict({
    "model_name": "GCN",
    "num_layers": 3,
    "dropout": 0.5,
    "learning_rate": 0.01,
    "weight_decay": 0.0005,
    "edge_dropout": 0.00,
})

In [4]:
from collections import namedtuple

# Immutable record bundling the graph, the class count and the
# train / validation / test node splits.
Dataset = namedtuple(
    "Dataset",
    "graph num_classes train_index train_label "
    "valid_index valid_label test_index",
)

def load_edges(num_nodes, self_loop=True, add_inverse_edge=True,
               edges_path="work/edges.csv"):
    """Read the edge list from a CSV file and optionally augment it.

    Args:
        num_nodes: Number of nodes; one self-loop is generated for every id
            in ``range(num_nodes)`` when ``self_loop`` is true.
        self_loop: If true, append an edge ``(i, i)`` for every node id.
        add_inverse_edge: If true, append the reversed copy ``(dst, src)`` of
            every edge read from the file.
        edges_path: Header-less two-column CSV file (src, dst). Defaults to
            the path this notebook originally hard-coded.

    Returns:
        A 2-D array with one ``[src, dst]`` pair per row, ordered as: edges
        from the file, then reversed edges, then self-loops. No
        de-duplication is performed.
    """
    edges = pd.read_csv(edges_path, header=None, names=["src", "dst"]).values

    if add_inverse_edge:
        # Reversing the column order swaps src and dst for every row.
        edges = np.vstack([edges, edges[:, ::-1]])

    if self_loop:
        # Use a separate name so the boolean flag is not overwritten.
        node_ids = np.arange(num_nodes)
        loop_edges = np.column_stack([node_ids, node_ids])
        edges = np.vstack([edges, loop_edges])

    return edges

def load(train_ratio=0.8, num_classes=35):
    """Load node features, edges and the train/valid/test split from ``work/``.

    Args:
        train_ratio: Fraction of the rows in ``work/train.csv`` (taken in file
            order, without shuffling) used for training; the remaining rows
            become the validation set. Must lie in ``[0, 1]``.
        num_classes: Number of label classes stored on the returned dataset.

    Returns:
        A ``Dataset`` holding the graph (with ``"feat"`` and ``"norm"`` node
        features), ``num_classes``, and the train / validation / test node
        ids together with the train / validation labels.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be within [0, 1], got %r" % (train_ratio,))

    # Node features and edges (with inverse edges and self-loops) form the graph.
    node_feat = np.load("work/feat.npy")
    num_nodes = node_feat.shape[0]
    edges = load_edges(num_nodes=num_nodes, self_loop=True, add_inverse_edge=True)
    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges, node_feat={"feat": node_feat})

    # Per-node normalisation term max(indegree, 1) ** -0.5, stored as a column
    # vector under the "norm" node feature.
    indegree = graph.indegree()
    norm = np.maximum(indegree.astype("float32"), 1)
    norm = np.power(norm, -0.5)
    graph.node_feat["norm"] = np.expand_dims(norm, -1)

    # Labelled nodes: the leading `train_ratio` share trains, the rest validates.
    df = pd.read_csv("work/train.csv")
    node_index = df["nid"].values
    node_label = df["label"].values
    train_part = int(len(node_index) * train_ratio)
    train_index = node_index[:train_part]
    train_label = node_label[:train_part]
    valid_index = node_index[train_part:]
    valid_label = node_label[train_part:]

    # Unlabelled nodes to predict.
    test_index = pd.read_csv("work/test.csv")["nid"].values

    dataset = Dataset(graph=graph,
                    train_label=train_label,
                    train_index=train_index,
                    valid_index=valid_index,
                    valid_label=valid_label,
                    test_index=test_index, num_classes=num_classes)
    return dataset


In [5]:
# Load everything once, then turn each index / label vector into an (N, 1)
# column.
dataset = load()

train_index = dataset.train_index[..., np.newaxis]
train_label = dataset.train_label.reshape(-1, 1)

val_index = dataset.valid_index[..., np.newaxis]
val_label = dataset.valid_label.reshape(-1, 1)

test_index = dataset.test_index[..., np.newaxis]
# The test nodes have no labels; an all-zero int64 column stands in for them.
test_label = np.zeros((test_index.shape[0], 1), dtype="int64")



In [6]:
import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model

# Pick the execution device: GPU 0 when this Paddle build has CUDA support,
# otherwise fall back to the CPU so the notebook still runs.
if fluid.is_compiled_with_cuda():
    place = fluid.CUDAPlace(0)
else:
    place = fluid.CPUPlace()

# The training graph is built into the default main / startup programs.
train_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
with fluid.program_guard(train_program, startup_program):
    with fluid.unique_name.guard():
        gw, loss, acc, pred = build_model(dataset,
                            config=config,
                            phase="train",
                            main_prog=train_program)

# The evaluation graph gets its own Program but is built against the same
# startup program, again under a fresh unique_name guard.
# NOTE(review): presumably this makes both graphs resolve to the same
# parameter names so evaluation sees the trained weights - confirm in build_model.
test_program = fluid.Program()
with fluid.program_guard(test_program, startup_program):
    with fluid.unique_name.guard():
        _gw, v_loss, v_acc, v_pred = build_model(dataset,
            config=config,
            phase="test",
            main_prog=test_program)


# Work with a for_test clone of the evaluation program from here on.
test_program = test_program.clone(for_test=True)

exe = fluid.Executor(place)


In [7]:
# Total number of full-batch training epochs. Kept under its own name so the
# loop variable below does not overwrite it.
num_epochs = 400
# Run the startup program before any training step.
exe.run(startup_program)

# Convert the graph data into a feed dict for the Paddle Executor.
feed_dict = gw.to_feed(dataset.graph)

# The node ids / labels never change between epochs, so build the two feeds
# once instead of re-creating the int64 arrays on every iteration.
# node_index: ids of the nodes to fetch; node_label: their labels.
train_feed = dict(feed_dict)
train_feed["node_index"] = np.array(train_index, dtype="int64")
train_feed["node_label"] = np.array(train_label, dtype="int64")

val_feed = dict(feed_dict)
val_feed["node_index"] = np.array(val_index, dtype="int64")
val_feed["node_label"] = np.array(val_label, dtype="int64")

for epoch in range(num_epochs):
    # Full-batch training step on the training nodes.
    train_loss, train_acc = exe.run(train_program,
                                feed=train_feed,
                                fetch_list=[loss, acc],
                                return_numpy=True)

    # Full-batch evaluation on the validation nodes.
    val_loss, val_acc = exe.run(test_program,
                            feed=val_feed,
                            fetch_list=[v_loss, v_acc],
                            return_numpy=True)
    print("Epoch", epoch, "Train Acc", train_acc[0], "Valid Acc", val_acc[0])

Epoch 0 Train Acc 0.00840037 Valid Acc 0.17498398
Epoch 1 Train Acc 0.15065494 Valid Acc 0.14864384
Epoch 2 Train Acc 0.16136898 Valid Acc 0.14850146
Epoch 3 Train Acc 0.15670606 Valid Acc 0.14850146
Epoch 4 Train Acc 0.15336014 Valid Acc 0.3218481
Epoch 5 Train Acc 0.2042429 Valid Acc 0.18801168
Epoch 6 Train Acc 0.21556205 Valid Acc 0.18808286
Epoch 7 Train Acc 0.22052751 Valid Acc 0.2731544
Epoch 8 Train Acc 0.24955507 Valid Acc 0.3272585
Epoch 9 Train Acc 0.28735673 Valid Acc 0.30490497
Epoch 10 Train Acc 0.26802877 Valid Acc 0.29287392
Epoch 11 Train Acc 0.25040933 Valid Acc 0.32263118
Epoch 12 Train Acc 0.29881826 Valid Acc 0.32661778
Epoch 13 Train Acc 0.31777248 Valid Acc 0.31807503
Epoch 14 Train Acc 0.31663343 Valid Acc 0.31999716
Epoch 15 Train Acc 0.3230049 Valid Acc 0.33829287
Epoch 16 Train Acc 0.32942978 Valid Acc 0.36164305
Epoch 17 Train Acc 0.3531537 Valid Acc 0.35067987
Epoch 18 Train Acc 0.34336513 Valid Acc 0.33523172
Epoch 19 Train Acc 0.34267104 Valid Acc 0.33423

In [None]:
# Point the feed at the test nodes. The labels are dummies: the program still
# expects a node_label input, but only v_pred is fetched.
feed_dict.update(
    node_index=np.array(test_index, dtype="int64"),
    node_label=np.array(test_label, dtype="int64"),  # dummy labels
)
test_outputs = exe.run(test_program,
                       feed=feed_dict,
                       fetch_list=[v_pred],
                       return_numpy=True)
test_prediction = test_outputs[0]

In [None]:
# Flatten ids and predictions into two columns and write them out as CSV.
# NOTE(review): assumes test_prediction holds exactly one value per test
# node - confirm against what build_model returns as `pred`.
submission_columns = {
    "nid": test_index.ravel(),
    "label": test_prediction.ravel(),
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)