In [1]:
import pandas as pd
import numpy as np

from ctypes import CDLL
import os

## add dll for dgl
tem = CDLL(r"f:/anaconda3/envs/deep-learn/Lib/site-packages/dgl/dgl.dll", winmode=0)
os.add_dll_directory(r"f:/anaconda3/envs/deep-learn/Lib/site-packages/dgl/dgl.dll")

import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import tqdm
from dgl.data import AsNodePredDataset
from dgl.dataloading import (
    DataLoader,
    MultiLayerFullNeighborSampler,
    NeighborSampler,
)

# import LabelPropagation
from gtrick.dgl import LabelPropagation

In [4]:
# Generate a dataset suitable for DGL

# edges = []
# for index, row in data.iterrows():
#     c = row["node_id"]
#     neighbour = eval(row["neighbour"])
#     for n in neighbour:
#         edges.append([c,n])
        
# pd.DataFrame(edges).to_csv("data/edges.csv",index=None,header=None)

In [5]:
# create dgl.graph
data = pd.read_csv("data/Children.csv",sep=",")
data["label"] = data["label"].fillna(-1)

edges = pd.read_csv("data/edges.csv",header=None)
g = dgl.graph((edges.iloc[:,0], edges.iloc[:,1]))
g.ndata["label"] = torch.from_numpy(data["label"].to_numpy()).long()

train_val_mask = g.ndata["label"] != -1
test_mask = g.ndata["label"] == -1
train_val_idx = torch.nonzero(train_val_mask).squeeze()

train_ratio = 0.8
val_ratio = 1.0 - train_ratio
train_size = int(train_ratio * train_val_idx.shape[0])
val_size = train_val_idx.shape[0] - train_size

random_indices = torch.randperm(train_val_idx.shape[0])
train_idx = train_val_idx[random_indices[:train_size]]
val_idx = train_val_idx[random_indices[train_size:]]

train_mask = torch.zeros_like(g.ndata["label"])
train_mask[train_idx] = True

val_mask = torch.zeros_like(g.ndata["label"])
val_mask[val_idx] = True

g.ndata["train_mask"] = train_mask
g.ndata["val_mask"] = val_mask
g.ndata["test_mask"] = test_mask
g.ndata["feat"] = torch.tensor(torch.load("data/bert-cls-embeddings.pth"))

In [12]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

train_idx = torch.nonzero(g.ndata["train_mask"]).squeeze()
val_idx = torch.nonzero(g.ndata["val_mask"]).squeeze()
test_idx = torch.nonzero(g.ndata["test_mask"]).squeeze()

train_X, val_X, train_y, val_y = g.ndata["feat"][train_idx], g.ndata["feat"][val_idx], g.ndata["label"][train_idx], g.ndata["label"][val_idx], 

lightgbm

In [13]:
lgb_params = {'n_estimators': 300,'learning_rate': 0.1,'class_weight': "balanced",
            'reg_alpha': 0.25, 'reg_lambda': 0.2,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'max_depth': 4, 'num_leaves': 4,
            'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}

lgb_model = LGBMClassifier()
lgb_model.fit(train_X, train_y, eval_set=[(val_X, val_y)])
pred_y = lgb_model.predict_proba(val_X)


[1]	valid_0's multi_logloss: 1.31387
[2]	valid_0's multi_logloss: 1.18682
[3]	valid_0's multi_logloss: 1.07347
[4]	valid_0's multi_logloss: 1.00416
[5]	valid_0's multi_logloss: 0.948575
[6]	valid_0's multi_logloss: 0.904282
[7]	valid_0's multi_logloss: 0.90139
[8]	valid_0's multi_logloss: 0.925054
[9]	valid_0's multi_logloss: 0.893115
[10]	valid_0's multi_logloss: 0.910852
[11]	valid_0's multi_logloss: 0.871206
[12]	valid_0's multi_logloss: 0.891732
[13]	valid_0's multi_logloss: 0.91558
[14]	valid_0's multi_logloss: 0.947161
[15]	valid_0's multi_logloss: 0.95021
[16]	valid_0's multi_logloss: 1.0515
[17]	valid_0's multi_logloss: 1.13138
[18]	valid_0's multi_logloss: 1.27464
[19]	valid_0's multi_logloss: 1.20725
[20]	valid_0's multi_logloss: 1.54218
[21]	valid_0's multi_logloss: 1.69283
[22]	valid_0's multi_logloss: 1.54685
[23]	valid_0's multi_logloss: 1.5841
[24]	valid_0's multi_logloss: 2.12095
[25]	valid_0's multi_logloss: 1.96487
[26]	valid_0's multi_logloss: 2.40823
[27]	valid_0's 

In [23]:
accuracy_score(torch.argmax(torch.tensor(pred_y),dim=1), g.ndata["label"][val_idx])

0.7811382113821138

tensor([2, 1, 1,  ..., 2, 0, 0])

graphsage

In [4]:
class SAGE(nn.Module):
    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean"))
        self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean"))
        self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean"))
        self.dropout = nn.Dropout(0.3)
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != len(self.layers) - 1:
                h = F.relu(h)
                h = self.dropout(h)
        return h

    def inference(self, g, device, batch_size):
        feat = g.ndata["feat"]
        sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"])
        dataloader = DataLoader(
            g,
            torch.arange(g.num_nodes()).to(g.device),
            sampler,
            device=device,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=0,
        )
        buffer_device = torch.device("cpu")
        pin_memory = buffer_device != device

        for l, layer in enumerate(self.layers):
            y = torch.empty(
                g.num_nodes(),
                self.hid_size if l != len(self.layers) - 1 else self.out_size,
                dtype=feat.dtype,
                device=buffer_device,
                pin_memory=pin_memory,
            )
            feat = feat.to(device)
            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                x = feat[input_nodes]
                h = layer(blocks[0], x)
                if l != len(self.layers) - 1:
                    h = F.relu(h)
                    h = self.dropout(h)
                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
            feat = y
        return y


def evaluate(model, graph, dataloader, num_classes):
    model.eval()
    ys = []
    y_hats = []
    for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
        with torch.no_grad():
            x = blocks[0].srcdata["feat"]
            ys.append(blocks[-1].dstdata["label"])
            y_hats.append(model(blocks, x))

    # label propagation
    # lp_layers, lp_alpah = 50, 0.9
    # lp = LabelPropagation(lp_layers, lp_alpah)
    # yh = lp(graph, graph.ndata["label"], mask=graph.ndata["train_mask"])

    return MF.accuracy(
        torch.cat(y_hats),
        torch.cat(ys),
        task="multiclass",
        num_classes=num_classes,
    )


def layerwise_infer(device, graph, model, num_classes, batch_size):
    test_idx = torch.nonzero(g.ndata["test_mask"]).squeeze().to(device)
    model.eval()
    with torch.no_grad():
        pred = model.inference(graph, device, batch_size)
        return pred
        
def train(device, g, model, num_classes):
    train_idx = torch.nonzero(g.ndata["train_mask"]).squeeze().to(device)
    val_idx = torch.nonzero(g.ndata["val_mask"]).squeeze().to(device)
    sampler = NeighborSampler(
        [20, 20, 20], 
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )
    train_dataloader = DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        drop_last=False,
        num_workers=0,
    )
    val_dataloader = DataLoader(
        g,
        val_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        drop_last=False,
        num_workers=0,
    )

    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)

    for epoch in range(10):
        model.train()
        total_loss = 0
        for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
            x = blocks[0].srcdata["feat"]
            y = blocks[-1].dstdata["label"]
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        acc = evaluate(model, g, val_dataloader, num_classes)
        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch, total_loss / (it + 1), acc.item()
            )
        )

In [None]:
g = g.to("cpu")
num_classes = 24
device = torch.device("cpu")

in_size = g.ndata["feat"].shape[1]
out_size = num_classes
model = SAGE(in_size, 800, out_size).to(device)

print("Training...")
train(device, g, model, num_classes)

In [5]:
# test the model
print("Testing...")
pred = layerwise_infer(
    device, g, model, num_classes, batch_size=4096
)

Testing...


100%|██████████| 19/19 [00:03<00:00,  6.31it/s]
100%|██████████| 19/19 [00:01<00:00, 12.16it/s]
100%|██████████| 19/19 [00:00<00:00, 19.58it/s]


In [15]:
pred_label = torch.argmax(pred,dim=1)
test_idx = torch.nonzero(g.ndata["test_mask"]).squeeze()

result = pd.DataFrame({
    "node_id":test_idx,
    "label":pred_label[test_idx]
})

result.to_csv("result/submission.csv",index=None)