In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from dgl.utils import expand_as_pair
from dgl import function as fn
from dgl.base import DGLError
from dgl.nn.functional import edge_softmax
import numpy as np
cat_features = ["Target",
                "Type",
                "Location"]


class PosEncoding(nn.Module):

    def __init__(self, dim, device, base=10000, bias=0):

        super(PosEncoding, self).__init__()
        """
        Initialize the posencoding component
        :param dim: the encoding dimension 
		:param device: where to train model
		:param base: the encoding base
		:param bias: the encoding bias
        """
        p = []
        sft = []
        for i in range(dim):
            b = (i - i % 2) / dim
            p.append(base ** -b)
            if i % 2:
                sft.append(np.pi / 2.0 + bias)
            else:
                sft.append(bias)
        self.device = device
        self.sft = torch.tensor(
            sft, dtype=torch.float32).view(1, -1).to(device)
        self.base = torch.tensor(p, dtype=torch.float32).view(1, -1).to(device)

    def forward(self, pos):
        with torch.no_grad():
            if isinstance(pos, list):
                pos = torch.tensor(pos, dtype=torch.float32).to(self.device)
            pos = pos.view(-1, 1)
            x = pos / self.base + self.sft
            return torch.sin(x)






class TransEmbedding(nn.Module):

    def __init__(self, df=None, device='cpu', dropout=0.2, in_feats=82, cat_features=None):
        """
        Initialize the attribute embedding and feature learning compoent

        :param df: the feature
                :param device: where to train model
                :param dropout: the dropout rate
                :param in_feat: the shape of input feature in dimension 1
                :param cat_feature: category features
        """
        super(TransEmbedding, self).__init__()
        self.time_pe = PosEncoding(dim=in_feats, device=device, base=100)
        #time_emb = time_pe(torch.sin(torch.tensor(df['time_span'].values)/86400*torch.pi))
        self.cat_table = nn.ModuleDict({col: nn.Embedding(max(df[col].unique(
        ))+1, in_feats).to(device) for col in cat_features if col not in {"Labels", "Time"}})
        self.label_table = nn.Embedding(3, in_feats, padding_idx=2).to(device)
        self.time_emb = None
        self.emb_dict = None
        self.label_emb = None
        self.cat_features = cat_features
        self.forward_mlp = nn.ModuleList(
            [nn.Linear(in_feats, in_feats) for i in range(len(cat_features))])
        self.dropout = nn.Dropout(dropout)

    def forward_emb(self, df):
        if self.emb_dict is None:
            self.emb_dict = self.cat_table
        # print(self.emb_dict)
        # print(df['trans_md'])
        support = {col: self.emb_dict[col](
            df[col]) for col in self.cat_features if col not in {"Labels", "Time"}}
        #self.time_emb = self.time_pe(torch.sin(torch.tensor(df['time_span'])/86400*torch.pi))
        #support['time_span'] = self.time_emb
        #support['labels'] = self.label_table(df['labels'])
        return support

    def forward(self, df):
        support = self.forward_emb(df)
        output = 0
        for i, k in enumerate(support.keys()):
            # if k =='time_span':
            #    print(df[k].shape)
            support[k] = self.dropout(support[k])
            support[k] = self.forward_mlp[i](support[k])
            output = output + support[k]
        return output


class TransformerConv(nn.Module):

    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 # feat_drop=0.6,
                 # attn_drop=0.6,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        """
        Initialize the transformer layer.
        Attentional weights are jointly optimized in an end-to-end mechanism with graph neural networks and fraud detection networks.
            :param in_feat: the shape of input feature
            :param out_feats: the shape of output feature
            :param num_heads: the number of multi-head attention 
            :param bias: whether to use bias
            :param allow_zero_in_degree: whether to allow zero in degree
            :param skip_feat: whether to skip some feature 
            :param gated: whether to use gate
            :param layer_norm: whether to use layer regularization
            :param activation: the type of activation function   
        """

        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)

        #self.feat_dropout = nn.Dropout(p=feat_drop)
        #self.attn_dropout = nn.Dropout(p=attn_drop)
        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3*self._out_feats*self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats*self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        """
        Description: Transformer Graph Convolution
        :param graph: input graph
            :param feat: input feat
            :param get_attention: whether to get attention
        """

        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        # check if feat is a tuple
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # Step 0. q, k, v
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)
        # Assign features to nodes
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})
        # Step 1. dot product
        graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))

        # Step 2. edge softmax to compute attention scores
        graph.edata['sa'] = edge_softmax(
            graph, graph.edata['a'] / self._out_feats**0.5)

        # Step 3. Broadcast softmax value to each edge, and aggregate dst node
        graph.update_all(fn.u_mul_e('ft_v', 'sa', 'attn'),
                         fn.sum('attn', 'agg_u'))

        # output results to the destination nodes
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats*self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst


class GraphAttnModel(nn.Module):
    def __init__(self,
                 in_feats,
                 hidden_dim,
                 n_layers,
                 n_classes,
                 heads,
                 activation,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 post_proc=True,
                 n2v_feat=True,
                 drop=None,
                 ref_df=None,
                 cat_features=None,
                 nei_features=None,
                 device='cpu'):
        """
        Initialize the GTAN-GNN model
        :param in_feats: the shape of input feature
                :param hidden_dim: model hidden layer dimension
                :param n_layers: the number of GTAN layers
                :param n_classes: the number of classification
                :param heads: the number of multi-head attention 
                :param activation: the type of activation function
                :param skip_feat: whether to skip some feature
                :param gated: whether to use gate
        :param layer_norm: whether to use layer regularization
                :param post_proc: whether to use post processing
                :param n2v_feat: whether to use n2v features
        :param drop: whether to use drop
                :param ref_df: whether to refer other node features
                :param cat_features: category features
                :param nei_features: neighborhood statistic features
        :param device: where to train model
        """

        super(GraphAttnModel, self).__init__()
        self.in_feats = in_feats
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_classes = n_classes
        self.heads = heads
        self.activation = activation
        #self.input_drop = lambda x: x
        self.input_drop = nn.Dropout(drop[0])
        self.drop = drop[1]
        self.output_drop = nn.Dropout(self.drop)
        # self.pn = PairNorm(mode=pairnorm)
        if n2v_feat:
            self.n2v_mlp = TransEmbedding(
                ref_df, device=device, in_feats=in_feats, cat_features=cat_features)
        else:
            self.n2v_mlp = lambda x: x
        self.layers = nn.ModuleList()
        self.layers.append(nn.Embedding(
            n_classes+1, in_feats, padding_idx=n_classes))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(nn.Sequential(nn.BatchNorm1d(self.hidden_dim*self.heads[0]),
                                         nn.PReLU(),
                                         nn.Dropout(self.drop),
                                         nn.Linear(self.hidden_dim *
                                                   self.heads[0], in_feats)
                                         ))

        # build multiple layers
        self.layers.append(TransformerConv(in_feats=self.in_feats,
                                           out_feats=self.hidden_dim,
                                           num_heads=self.heads[0],
                                           skip_feat=skip_feat,
                                           gated=gated,
                                           layer_norm=layer_norm,
                                           activation=self.activation))

        for l in range(0, (self.n_layers - 1)):
            # due to multi-head, the in_dim = num_hidden * num_heads
            self.layers.append(TransformerConv(in_feats=self.hidden_dim * self.heads[l - 1],
                                               out_feats=self.hidden_dim,
                                               num_heads=self.heads[l],
                                               skip_feat=skip_feat,
                                               gated=gated,
                                               layer_norm=layer_norm,
                                               activation=self.activation))
        if post_proc:
            self.layers.append(nn.Sequential(nn.Linear(self.hidden_dim * self.heads[-1], self.hidden_dim * self.heads[-1]),
                                             nn.BatchNorm1d(
                                                 self.hidden_dim * self.heads[-1]),
                                             nn.PReLU(),
                                             nn.Dropout(self.drop),
                                             nn.Linear(self.hidden_dim * self.heads[-1], self.n_classes)))
        else:
            self.layers.append(nn.Linear(self.hidden_dim *
                               self.heads[-1], self.n_classes))

    def forward(self, blocks, features, labels, n2v_feat=None):
        """
        :param blocks: train blocks
        :param features: train features  (|input|, feta_dim)
        :param labels: train labels (|input|, )
        :param n2v_feat: whether to use n2v features 
        """

        if n2v_feat is None:
            h = features
        else:
            h = self.n2v_mlp(n2v_feat)
            h = features + h

        label_embed = self.input_drop(self.layers[0](labels))
        label_embed = self.layers[1](h) + self.layers[2](label_embed)
        label_embed = self.layers[3](label_embed)
        h = h + label_embed  # residual

        for l in range(self.n_layers):
            h = self.output_drop(self.layers[l+4](blocks[l], h))

        logits = self.layers[-1](h)

        return logits


In [35]:
import torch
import torch.nn as nn

# 假设分类任务有 3 个类别
n_classes = 3
in_feats = 8
padding_idx = n_classes

# 创建嵌入层
embedding = nn.Embedding(n_classes + 1, in_feats, padding_idx=padding_idx)

# 示例输入标签，包含填充索引
labels = torch.tensor([0, 1, 2, 3])  # 这里的 3 是填充索引

# 进行嵌入操作
label_embed = embedding(labels)

print("嵌入向量的形状:", label_embed.shape)
print("填充索引对应的嵌入向量:", label_embed[-1])

print(label_embed)

嵌入向量的形状: torch.Size([4, 8])
填充索引对应的嵌入向量: tensor([0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward0>)
tensor([[ 2.2289,  1.2767,  1.0011, -1.3499,  0.3960,  1.4382,  0.3253, -0.5629],
        [-0.9021,  0.8298,  1.6002,  0.5777, -0.9464,  0.5399,  0.6111,  1.4507],
        [-0.2650,  0.4222,  0.3282, -0.0244, -0.4252, -0.4598, -0.4068,  0.3397],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
       grad_fn=<EmbeddingBackward0>)


In [None]:
import copy


class early_stopper(object):
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Initialize the early stopper
        :param patience: the maximum number of rounds tolerated
        :param verbose: whether to stop early
        :param delta: the regularization factor
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_value = None
        self.best_cv = None
        self.is_earlystop = False
        self.count = 0
        self.best_model = None
        # self.val_preds = []
        # self.val_logits = []

    def earlystop(self, loss, model=None):  # , preds, logits):
        """
        :param loss: the loss score on validation set
        :param model: the model
        """
        value = -loss
        cv = loss
        # value = ap

        if self.best_value is None:
            self.best_value = value
            self.best_cv = cv
            self.best_model = copy.deepcopy(model).to('cpu')
            # self.val_preds = preds
            # self.val_logits = logits
        elif value < self.best_value + self.delta:
            self.count += 1
            if self.verbose:
                print('EarlyStoper count: {:02d}'.format(self.count))
            if self.count >= self.patience:
                self.is_earlystop = True
        else:
            self.best_value = value
            self.best_cv = cv
            self.best_model = copy.deepcopy(model).to('cpu')
            # self.val_preds = preds
            # self.val_logits = logits
            self.count = 0


In [None]:
import copy


def load_lpa_subtensor(node_feat, work_node_feat, labels, seeds, input_nodes, device):
    batch_inputs = node_feat[input_nodes].to(device)
    batch_work_inputs = {i: work_node_feat[i][input_nodes].to(
        device) for i in work_node_feat if i not in {"Labels"}}
    # for i in batch_work_inputs:
    #    print(batch_work_inputs[i].shape)
    batch_labels = labels[seeds].to(device)
    train_labels = copy.deepcopy(labels)
    propagate_labels = train_labels[input_nodes]
    propagate_labels[:seeds.shape[0]] = 2
    return batch_inputs, batch_work_inputs, batch_labels, propagate_labels.to(device)


In [None]:
import numpy as np
import dgl
import torch
import os
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
import torch.optim as optim
from scipy.io import loadmat
import pandas as pd
import pickle
from sklearn.model_selection import StratifiedKFold, train_test_split
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from dgl.dataloading import MultiLayerFullNeighborSampler
from dgl.dataloading import NodeDataLoader
from torch.optim.lr_scheduler import MultiStepLR
def gtan_main(feat_df, graph, train_idx, test_idx, labels, args, cat_features):
    print('-------------------------------------')
    device = args['device'] # 'cuda:0' if torch.cuda.is_available() else 'cpu'
    graph = graph.to(device) # move graph to device
    oof_predictions = torch.from_numpy(
        np.zeros([len(feat_df), 2])).float().to(device) 
    #oof_predictions 用于存储每个折叠（fold）中验证集样本的预测结果。
    # 每次训练一个模型并在验证集上进行预测时，预测结果会填充到 oof_predictions 中对应的位置。
    # 这样，在所有折叠的训练和验证完成后，oof_predictions 就包含了整个训练集在交叉验证过程中的所有袋外预测结果。
    # 这些预测结果可以用于评估模型在训练集上的性能
    test_predictions = torch.from_numpy(
        np.zeros([len(feat_df), 2])).float().to(device)


    kfold = StratifiedKFold(
        n_splits=args['n_fold'], shuffle=True, random_state=args['seed'])


    y_target = labels.iloc[train_idx].values
    print()
    print(feat_df.head())
    print()
    print(feat_df.values)
    print()
    num_feat = torch.from_numpy(feat_df.values).float().to(device) 
    cat_feat = {col: torch.from_numpy(feat_df[col].values).long().to(
        device) for col in cat_features}
    
    
    y = labels

    labels = torch.from_numpy(y.values).long().to(device)
    loss_fn = nn.CrossEntropyLoss().to(device)

    for fold, (trn_idx, val_idx) in enumerate(kfold.split(feat_df.iloc[train_idx], y_target)):
        print(f'Training fold {fold + 1}')
        trn_ind, val_ind = torch.from_numpy(np.array(train_idx)[trn_idx]).long().to(
            device), torch.from_numpy(np.array(train_idx)[val_idx]).long().to(device)

        train_sampler = MultiLayerFullNeighborSampler(args['n_layers'])
        train_dataloader = NodeDataLoader(graph,# 图数据
                                          trn_ind, # 训练集节点索引
                                          train_sampler, # 采样器
                                          device=device, # 设备
                                          use_ddp=False, # 是否使用分布式训练
                                          batch_size=args['batch_size'], # 批大小
                                          shuffle=True, # 是否打乱数据
                                          drop_last=False, # 是否丢弃最后一批
                                          num_workers=0 # 工作线程数
                                          )
        val_sampler = MultiLayerFullNeighborSampler(args['n_layers'])
        val_dataloader = NodeDataLoader(graph, # 图数据
                                        val_ind, # 验证集节点索引
                                        val_sampler, # 采样器
                                        use_ddp=False, # 是否使用分布式训练
                                        device=device, # 设备
                                        batch_size=args['batch_size'], # 批大小
                                        shuffle=True, # 是否打乱数据
                                        drop_last=False, # 是否丢弃最后一批
                                        num_workers=0 # 工作线程数
                                        )
        # TODO
        model = GraphAttnModel(in_feats=feat_df.shape[1],
                               # 为什么要整除4？
                               hidden_dim=args['hid_dim']//4,
                               n_classes=2,
                               heads=[4]*args['n_layers'],  # [4,4,4]
                               activation=nn.PReLU(),
                               n_layers=args['n_layers'],
                               drop=args['dropout'],
                               device=device,
                               gated=args['gated'],
                               ref_df=feat_df,
                               cat_features=cat_feat).to(device)
        
        lr = args['lr'] * np.sqrt(args['batch_size']/1024)  # 0.00075
        optimizer = optim.Adam(model.parameters(), lr=lr,
                               weight_decay=args['wd'])
        lr_scheduler = MultiStepLR(optimizer=optimizer, milestones=[
                                   4000, 12000], gamma=0.3)

        earlystoper = early_stopper(
            patience=args['early_stopping'], verbose=True)
        start_epoch, max_epochs = 0, 2000
        for epoch in range(start_epoch, args['max_epochs']):
            train_loss_list = []
            # train_acc_list = []
            model.train()
            for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):
                batch_inputs, batch_work_inputs, batch_labels, lpa_labels = load_lpa_subtensor(num_feat, cat_feat, labels,
                                                                                               seeds, input_nodes, device)
                # (|input|, feat_dim); null; (|batch|,); (|input|,)
                blocks = [block.to(device) for block in blocks]
                train_batch_logits = model(
                    blocks, batch_inputs, lpa_labels, batch_work_inputs)
                mask = batch_labels == 2
                train_batch_logits = train_batch_logits[~mask]
                batch_labels = batch_labels[~mask]
                # batch_labels[mask] = 0

                train_loss = loss_fn(train_batch_logits, batch_labels)
                # backward
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()
                lr_scheduler.step()
                train_loss_list.append(train_loss.cpu().detach().numpy())

                if step % 10 == 0:
                    tr_batch_pred = torch.sum(torch.argmax(train_batch_logits.clone(
                    ).detach(), dim=1) == batch_labels) / batch_labels.shape[0]
                    score = torch.softmax(train_batch_logits.clone().detach(), dim=1)[
                        :, 1].cpu().numpy()

                    # if (len(np.unique(score)) == 1):
                    #     print("all same prediction!")
                    try:
                        print('In epoch:{:03d}|batch:{:04d}, train_loss:{:4f}, '
                              'train_ap:{:.4f}, train_acc:{:.4f}, train_auc:{:.4f}'.format(epoch, step,
                                                                                           np.mean(
                                                                                               train_loss_list),
                                                                                           average_precision_score(
                                                                                               batch_labels.cpu().numpy(), score),
                                                                                           tr_batch_pred.detach(),
                                                                                           roc_auc_score(batch_labels.cpu().numpy(), score)))
                    except:
                        pass

            # mini-batch for validation
            val_loss_list = 0
            val_acc_list = 0
            val_all_list = 0
            model.eval()
            with torch.no_grad():
                for step, (input_nodes, seeds, blocks) in enumerate(val_dataloader):
                    batch_inputs, batch_work_inputs, batch_labels, lpa_labels = load_lpa_subtensor(num_feat, cat_feat, labels,
                                                                                                   seeds, input_nodes, device)

                    blocks = [block.to(device) for block in blocks]
                    val_batch_logits = model(
                        blocks, batch_inputs, lpa_labels, batch_work_inputs)
                    oof_predictions[seeds] = val_batch_logits
                    mask = batch_labels == 2
                    val_batch_logits = val_batch_logits[~mask]
                    batch_labels = batch_labels[~mask]
                    # batch_labels[mask] = 0
                    val_loss_list = val_loss_list + \
                        loss_fn(val_batch_logits, batch_labels)
                    # val_all_list += 1
                    val_batch_pred = torch.sum(torch.argmax(
                        val_batch_logits, dim=1) == batch_labels) / torch.tensor(batch_labels.shape[0])
                    val_acc_list = val_acc_list + val_batch_pred * \
                        torch.tensor(batch_labels.shape[0])
                    val_all_list = val_all_list + batch_labels.shape[0]
                    if step % 10 == 0:
                        score = torch.softmax(val_batch_logits.clone().detach(), dim=1)[
                            :, 1].cpu().numpy()
                        try:
                            print('In epoch:{:03d}|batch:{:04d}, val_loss:{:4f}, val_ap:{:.4f}, '
                                  'val_acc:{:.4f}, val_auc:{:.4f}'.format(epoch,
                                                                          step,
                                                                          val_loss_list/val_all_list,
                                                                          average_precision_score(
                                                                              batch_labels.cpu().numpy(), score),
                                                                          val_batch_pred.detach(),
                                                                          roc_auc_score(batch_labels.cpu().numpy(), score)))
                        except:
                            pass

            # val_acc_list/val_all_list, model)
            earlystoper.earlystop(val_loss_list/val_all_list, model)
            if earlystoper.is_earlystop:
                print("Early Stopping!")
                break
        print("Best val_loss is: {:.7f}".format(earlystoper.best_cv))
        test_ind = torch.from_numpy(np.array(test_idx)).long().to(device)
        test_sampler = MultiLayerFullNeighborSampler(args['n_layers'])
        test_dataloader = NodeDataLoader(graph,
                                         test_ind,
                                         test_sampler,
                                         use_ddp=False,
                                         device=device,
                                         batch_size=args['batch_size'],
                                         shuffle=True,
                                         drop_last=False,
                                         num_workers=0,
                                         )
        b_model = earlystoper.best_model.to(device)
        b_model.eval()
        with torch.no_grad():
            for step, (input_nodes, seeds, blocks) in enumerate(test_dataloader):
                # print(input_nodes)
                batch_inputs, batch_work_inputs, batch_labels, lpa_labels = load_lpa_subtensor(num_feat, cat_feat, labels,
                                                                                               seeds, input_nodes, device)

                blocks = [block.to(device) for block in blocks]
                test_batch_logits = b_model(
                    blocks, batch_inputs, lpa_labels, batch_work_inputs)
                test_predictions[seeds] = test_batch_logits
                test_batch_pred = torch.sum(torch.argmax(
                    test_batch_logits, dim=1) == batch_labels) / torch.tensor(batch_labels.shape[0])
                if step % 10 == 0:
                    print('In test batch:{:04d}'.format(step))
    mask = y_target == 2
    y_target[mask] = 0
    my_ap = average_precision_score(y_target, torch.softmax(
        oof_predictions, dim=1).cpu()[train_idx, 1])
    print("NN out of fold AP is:", my_ap)
    b_models, val_gnn_0, test_gnn_0 = earlystoper.best_model.to(
        'cpu'), oof_predictions, test_predictions

    test_score = torch.softmax(test_gnn_0, dim=1)[test_idx, 1].cpu().numpy()
    y_target = labels[test_idx].cpu().numpy()
    test_score1 = torch.argmax(test_gnn_0, dim=1)[test_idx].cpu().numpy()

    mask = y_target != 2
    test_score = test_score[mask]
    y_target = y_target[mask]
    test_score1 = test_score1[mask]

    print("test AUC:", roc_auc_score(y_target, test_score))
    print("test f1:", f1_score(y_target, test_score1, average="macro"))
    print("test AP:", average_precision_score(y_target, test_score))


def load_gtan_data(dataset: str, test_size: float):
    """
    Load graph, feature, and label given dataset name
    :param dataset: the dataset name
    :param test_size: the size of test set
    :returns: feature, label, graph, category features
    """
    # prefix = './antifraud/data/'
    prefix = os.path.join(os.path.dirname(__file__), "..", "..", "data/")
    if dataset == "S-FFSD":
        cat_features = ["Target", "Location", "Type"]

        df = pd.read_csv(prefix + "S-FFSDneofull.csv")
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]
        data = df[df["Labels"] <= 2]
        data = data.reset_index(drop=True)
        out = []
        alls = []
        allt = []
        pair = ["Source", "Target", "Location", "Type"]
        for column in pair:
            src, tgt = [], []
            edge_per_trans = 3
            for c_id, c_df in data.groupby(column):
                c_df = c_df.sort_values(by="Time")
                df_len = len(c_df)
                sorted_idxs = c_df.index
                src.extend([sorted_idxs[i] for i in range(df_len)
                            for j in range(edge_per_trans) if i + j < df_len])
                tgt.extend([sorted_idxs[i+j] for i in range(df_len)
                            for j in range(edge_per_trans) if i + j < df_len])
            alls.extend(src)
            allt.extend(tgt)
        alls = np.array(alls)
        allt = np.array(allt)
        g = dgl.graph((alls, allt))

      
        cal_list = ["Source", "Target", "Location", "Type"]
        for col in cal_list:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].apply(str).values)
        feat_data = data.drop("Labels", axis=1)
      
        labels = data["Labels"]
        ###
        feat_data.to_csv(prefix + "S-FFSD_feat_data.csv", index=None)
        labels.to_csv(prefix + "S-FFSD_label_data.csv", index=None)
        ###
        index = list(range(len(labels)))
        g.ndata['label'] = torch.from_numpy(
            labels.to_numpy()).to(torch.long)
        g.ndata['feat'] = torch.from_numpy(
            feat_data.to_numpy()).to(torch.float32)
        graph_path = prefix+"graph-{}.bin".format(dataset)
        dgl.data.utils.save_graphs(graph_path, [g])

        train_idx, test_idx, y_train, y_test = train_test_split(index, labels, stratify=labels, test_size=test_size/2,
                                                                random_state=2, shuffle=True)





    return feat_data, labels, train_idx, test_idx, g, cat_features

In [None]:
import os
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from config import Config
from feature_engineering.data_engineering import data_engineer_benchmark, span_data_2d, span_data_3d
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import pickle
import dgl
from scipy.io import loadmat
import yaml

logger = logging.getLogger(__name__)
# sys.path.append("..")


def parse_args():
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--method", default='gtan')  # specify which method to use
    method = vars(parser.parse_args())['method']  # dict

    # if method in ['']:
    #     yaml_file = "config/base_cfg.yaml"
    if method in ['mcnn']:
        yaml_file = "config/mcnn_cfg.yaml"
    elif method in ['stan']:
        yaml_file = "config/stan_cfg.yaml"
    elif method in ['stan_2d']:
        yaml_file = "config/stan_2d_cfg.yaml"
    elif method in ['stagn']:
        yaml_file = "config/stagn_cfg.yaml"
    elif method in ['gtan']:
        print("gtan was chosen")
        yaml_file = "gtan_cfg.yaml"
    elif method in ['rgtan']:
        yaml_file = "config/rgtan_cfg.yaml"
    elif method in ['hogrl']:
        yaml_file = "config/hogrl_cfg.yaml"
        
    else:
        raise NotImplementedError("Unsupported method.")

    # config = Config().get_config()
    with open(yaml_file) as file:
        args = yaml.safe_load(file)
    args['method'] = method
    return args


def base_load_data(args: dict):
    # load S-FFSD dataset for base models
    data_path = "data/S-FFSD.csv"
    feat_df = pd.read_csv(data_path)
    train_size = 1 - args['test_size']
    method = args['method']
    # for ICONIP16 & AAAI20
    if args['method'] == 'stan':
        if os.path.exists("data/tel_3d.npy"):
            return
        features, labels = span_data_3d(feat_df)
    else:
        if os.path.exists("data/tel_2d.npy"):
            return
        features, labels = span_data_2d(feat_df)
    num_trans = len(feat_df)
    trf, tef, trl, tel = train_test_split(
        features, labels, train_size=train_size, stratify=labels, shuffle=True)
    trf_file, tef_file, trl_file, tel_file = args['trainfeature'], args[
        'testfeature'], args['trainlabel'], args['testlabel']
    np.save(trf_file, trf)
    np.save(tef_file, tef)
    np.save(trl_file, trl)
    np.save(tel_file, tel)
    return


def main(args):
    if args['method'] == 'mcnn':
        pass
    elif args['method'] == 'stan_2d':
        pass
    elif args['method'] == 'stan':
       pass
    elif args['method'] == 'stagn':
       pass
    elif args['method'] == 'gtan':
        from methods.gtan.gtan_main import gtan_main, load_gtan_data
        feat_data, labels, train_idx, test_idx, g, cat_features = load_gtan_data(
            args['dataset'], args['test_size'])
        # feat_data用于存储数据集的特征数据，通常是一个矩阵或数组，每一行代表一个样本的特征向量。
        # labels 标签
        # train_idx, 
        # test_idx,
        # g, 
        # cat_features 类别特征
        gtan_main(
            feat_data, g, train_idx, test_idx, labels, args, cat_features)


if __name__ == "__main__":
    main(parse_args())


In [None]:
%tb

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from dgl.utils import expand_as_pair
from dgl import function as fn
from dgl.base import DGLError
from dgl.nn.functional import edge_softmax
import numpy as np
cat_features = ["Target",
                "Type",
                "Location"]


class PosEncoding(nn.Module):

    def __init__(self, dim, device, base=10000, bias=0):

        super(PosEncoding, self).__init__()
        """
        Initialize the posencoding component
        :param dim: the encoding dimension 
		:param device: where to train model
		:param base: the encoding base
		:param bias: the encoding bias
        """
        p = []
        sft = []
        for i in range(dim):
            b = (i - i % 2) / dim
            p.append(base ** -b)
            if i % 2:
                sft.append(np.pi / 2.0 + bias)
            else:
                sft.append(bias)
        self.device = device
        self.sft = torch.tensor(
            sft, dtype=torch.float32).view(1, -1).to(device)
        self.base = torch.tensor(p, dtype=torch.float32).view(1, -1).to(device)

    def forward(self, pos):
        with torch.no_grad():
            if isinstance(pos, list):
                pos = torch.tensor(pos, dtype=torch.float32).to(self.device)
            pos = pos.view(-1, 1)
            x = pos / self.base + self.sft
            return torch.sin(x)






class TransEmbedding(nn.Module):

    def __init__(self, df=None, device='cpu', dropout=0.2, in_feats=82, cat_features=None):
        """
        Initialize the attribute embedding and feature learning compoent

        :param df: the feature
                :param device: where to train model
                :param dropout: the dropout rate
                :param in_feat: the shape of input feature in dimension 1
                :param cat_feature: category features
        """
        super(TransEmbedding, self).__init__()
        self.time_pe = PosEncoding(dim=in_feats, device=device, base=100)
        #time_emb = time_pe(torch.sin(torch.tensor(df['time_span'].values)/86400*torch.pi))
        self.cat_table = nn.ModuleDict({col: nn.Embedding(max(df[col].unique(
        ))+1, in_feats).to(device) for col in cat_features if col not in {"Labels", "Time"}})
        self.label_table = nn.Embedding(3, in_feats, padding_idx=2).to(device)
        self.time_emb = None
        self.emb_dict = None
        self.label_emb = None
        self.cat_features = cat_features
        self.forward_mlp = nn.ModuleList(
            [nn.Linear(in_feats, in_feats) for i in range(len(cat_features))])
        self.dropout = nn.Dropout(dropout)

    def forward_emb(self, df):
        if self.emb_dict is None:
            self.emb_dict = self.cat_table
        # print(self.emb_dict)
        # print(df['trans_md'])
        support = {col: self.emb_dict[col](
            df[col]) for col in self.cat_features if col not in {"Labels", "Time"}}
        #self.time_emb = self.time_pe(torch.sin(torch.tensor(df['time_span'])/86400*torch.pi))
        #support['time_span'] = self.time_emb
        #support['labels'] = self.label_table(df['labels'])
        return support

    def forward(self, df):
        support = self.forward_emb(df)
        output = 0
        for i, k in enumerate(support.keys()):
            # if k =='time_span':
            #    print(df[k].shape)
            support[k] = self.dropout(support[k])
            support[k] = self.forward_mlp[i](support[k])
            output = output + support[k]
        return output


class TransformerConv(nn.Module):

    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 # feat_drop=0.6,
                 # attn_drop=0.6,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        """
        Initialize the transformer layer.
        Attentional weights are jointly optimized in an end-to-end mechanism with graph neural networks and fraud detection networks.
            :param in_feat: the shape of input feature
            :param out_feats: the shape of output feature
            :param num_heads: the number of multi-head attention 
            :param bias: whether to use bias
            :param allow_zero_in_degree: whether to allow zero in degree
            :param skip_feat: whether to skip some feature 
            :param gated: whether to use gate
            :param layer_norm: whether to use layer regularization
            :param activation: the type of activation function   
        """

        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)

        #self.feat_dropout = nn.Dropout(p=feat_drop)
        #self.attn_dropout = nn.Dropout(p=attn_drop)
        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3*self._out_feats*self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats*self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        """
        Description: Transformer Graph Convolution
        :param graph: input graph
            :param feat: input feat
            :param get_attention: whether to get attention
        """

        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        # check if feat is a tuple
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # Step 0. q, k, v
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)
        # Assign features to nodes
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})
        # Step 1. dot product
        graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))

        # Step 2. edge softmax to compute attention scores
        graph.edata['sa'] = edge_softmax(
            graph, graph.edata['a'] / self._out_feats**0.5)

        # Step 3. Broadcast softmax value to each edge, and aggregate dst node
        graph.update_all(fn.u_mul_e('ft_v', 'sa', 'attn'),
                         fn.sum('attn', 'agg_u'))

        # output results to the destination nodes
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats*self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst


class GraphAttnModel(nn.Module):
    def __init__(self,
                 in_feats,
                 hidden_dim,
                 n_layers,
                 n_classes,
                 heads,
                 activation,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 post_proc=True,
                 n2v_feat=True,
                 drop=None,
                 ref_df=None,
                 cat_features=None,
                 nei_features=None,
                 device='cpu'):
        """
        Initialize the GTAN-GNN model 
        :param in_feats: the shape of input feature
                :param hidden_dim: model hidden layer dimension
                :param n_layers: the number of GTAN layers
                :param n_classes: the number of classification
                :param heads: the number of multi-head attention 
                :param activation: the type of activation function
                :param skip_feat: whether to skip some feature
                :param gated: whether to use gate
        :param layer_norm: whether to use layer regularization
                :param post_proc: whether to use post processing
                :param n2v_feat: whether to use n2v features
        :param drop: whether to use drop
                :param ref_df: whether to refer other node features
                :param cat_features: category features
                :param nei_features: neighborhood statistic features
        :param device: where to train model
        """

        super(GraphAttnModel, self).__init__()
        self.in_feats = in_feats
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_classes = n_classes
        self.heads = heads
        self.activation = activation
        #self.input_drop = lambda x: x
        self.input_drop = nn.Dropout(drop[0])
        self.drop = drop[1]
        self.output_drop = nn.Dropout(self.drop)
        # self.pn = PairNorm(mode=pairnorm)
        if n2v_feat:
            self.n2v_mlp = TransEmbedding(
                ref_df, device=device, in_feats=in_feats, cat_features=cat_features)
        else:
            self.n2v_mlp = lambda x: x
        self.layers = nn.ModuleList()
        self.layers.append(nn.Embedding(
            n_classes+1, in_feats, padding_idx=n_classes))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(nn.Sequential(nn.BatchNorm1d(self.hidden_dim*self.heads[0]),
                                         nn.PReLU(),
                                         nn.Dropout(self.drop),
                                         nn.Linear(self.hidden_dim *
                                                   self.heads[0], in_feats)
                                         ))

        # build multiple layers
        self.layers.append(TransformerConv(in_feats=self.in_feats,
                                           out_feats=self.hidden_dim,
                                           num_heads=self.heads[0],
                                           skip_feat=skip_feat,
                                           gated=gated,
                                           layer_norm=layer_norm,
                                           activation=self.activation))

        for l in range(0, (self.n_layers - 1)):
            # due to multi-head, the in_dim = num_hidden * num_heads
            self.layers.append(TransformerConv(in_feats=self.hidden_dim * self.heads[l - 1],
                                               out_feats=self.hidden_dim,
                                               num_heads=self.heads[l],
                                               skip_feat=skip_feat,
                                               gated=gated,
                                               layer_norm=layer_norm,
                                               activation=self.activation))
        if post_proc:
            self.layers.append(nn.Sequential(nn.Linear(self.hidden_dim * self.heads[-1], self.hidden_dim * self.heads[-1]),
                                             nn.BatchNorm1d(
                                                 self.hidden_dim * self.heads[-1]),
                                             nn.PReLU(),
                                             nn.Dropout(self.drop),
                                             nn.Linear(self.hidden_dim * self.heads[-1], self.n_classes)))
        else:
            self.layers.append(nn.Linear(self.hidden_dim *
                               self.heads[-1], self.n_classes))

    def forward(self, blocks, features, labels, n2v_feat=None):
        """
        :param blocks: train blocks
        :param features: train features  (|input|, feta_dim)
        :param labels: train labels (|input|, )
        :param n2v_feat: whether to use n2v features 
        """

        if n2v_feat is None:
            h = features
        else:
            h = self.n2v_mlp(n2v_feat)
            h = features + h

        label_embed = self.input_drop(self.layers[0](labels))
        label_embed = self.layers[1](h) + self.layers[2](label_embed)
        label_embed = self.layers[3](label_embed)
        h = h + label_embed  # residual

        for l in range(self.n_layers):
            h = self.output_drop(self.layers[l+4](blocks[l], h))

        logits = self.layers[-1](h)

        return logits


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
import pandas as pd

# 模拟数据
# 图数据
g = dgl.rand_graph(100, 200)  # 生成一个包含100个节点和200条边的随机图
# 节点特征
num_features = 82
features = torch.randn(g.num_nodes(), num_features)
# 节点标签
num_classes = 3
labels = torch.randint(0, num_classes, (g.num_nodes(),))
# 模拟参考数据
ref_df = pd.DataFrame({
    "Target": torch.randint(0, 5, (g.num_nodes(),)).tolist(),
    "Type": torch.randint(0, 3, (g.num_nodes(),)).tolist(),
    "Location": torch.randint(0, 4, (g.num_nodes(),)).tolist()
})

# 定义模型参数
in_feats = num_features
hidden_dim = 64
n_layers = 2
heads = [4, 4]
activation = nn.PReLU()
skip_feat = True
gated = True
layer_norm = True
post_proc = True
n2v_feat = True
drop = [0.2, 0.2]
device = 'cpu'

# 创建模型实例
model = GraphAttnModel(
    in_feats=in_feats,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    n_classes=num_classes,
    heads=heads,
    activation=activation,
    skip_feat=skip_feat,
    gated=gated,
    layer_norm=layer_norm,
    post_proc=post_proc,
    n2v_feat=n2v_feat,
    drop=drop,
    ref_df=ref_df,
    cat_features=cat_features,
    device=device
)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    # 前向传播
    logits = model([g], features, labels, ref_df)
    loss = criterion(logits, labels)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# 推理
with torch.no_grad():
    test_logits = model([g], features, labels, ref_df)
    predicted_labels = torch.argmax(test_logits, dim=1)
    print("Predicted labels:", predicted_labels)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
import pandas as pd

# 模拟数据
# 图数据
g = dgl.rand_graph(100, 200)  # 生成一个包含100个节点和200条边的随机图
# 节点特征
num_features = 82
features = torch.randn(g.num_nodes(), num_features)
# 节点标签
num_classes = 3
labels = torch.randint(0, num_classes, (g.num_nodes(),))
# 模拟参考数据
ref_df = pd.DataFrame({
    "Target": torch.randint(0, 5, (g.num_nodes(),)).tolist(),
    "Type": torch.randint(0, 3, (g.num_nodes(),)).tolist(),
    "Location": torch.randint(0, 4, (g.num_nodes(),)).tolist()
})

# 定义模型参数
in_feats = num_features
hidden_dim = 64
n_layers = 2
heads = [4, 4]
activation = nn.PReLU()
skip_feat = True
gated = True
layer_norm = True
post_proc = True
n2v_feat = True
drop = [0.2, 0.2]
device = 'cpu'

# 创建模型实例
model = GraphAttnModel(
    in_feats=in_feats,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    n_classes=num_classes,
    heads=heads,
    activation=activation,
    skip_feat=skip_feat,
    gated=gated,
    layer_norm=layer_norm,
    post_proc=post_proc,
    n2v_feat=n2v_feat,
    drop=drop,
    ref_df=ref_df,
    cat_features=cat_features,
    device=device
)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    # 前向传播
    logits = model([g], features, labels, ref_df)
    loss = criterion(logits, labels)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# 推理
with torch.no_grad():
    test_logits = model([g], features, labels, ref_df)
    predicted_labels = torch.argmax(test_logits, dim=1)
    print("Predicted labels:", predicted_labels)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
import pandas as pd
import numpy as np


cat_features = ["Target",
                "Type",
                "Location"]


class PosEncoding(nn.Module):

    def __init__(self, dim, device, base=10000, bias=0):

        super(PosEncoding, self).__init__()
        """
        Initialize the posencoding component
        :param dim: the encoding dimension 
        :param device: where to train model
        :param base: the encoding base
        :param bias: the encoding bias
        """
        p = []
        sft = []
        for i in range(dim):
            b = (i - i % 2) / dim
            p.append(base ** -b)
            if i % 2:
                sft.append(np.pi / 2.0 + bias)
            else:
                sft.append(bias)
        self.device = device
        self.sft = torch.tensor(
            sft, dtype=torch.float32).view(1, -1).to(device)
        self.base = torch.tensor(p, dtype=torch.float32).view(1, -1).to(device)

    def forward(self, pos):
        with torch.no_grad():
            if isinstance(pos, list):
                pos = torch.tensor(pos, dtype=torch.float32).to(self.device)
            pos = pos.view(-1, 1)
            x = pos / self.base + self.sft
            return torch.sin(x)


class TransEmbedding(nn.Module):

    def __init__(self, df=None, device='cpu', dropout=0.2, in_feats=82, cat_features=None):
        """
        Initialize the attribute embedding and feature learning compoent

        :param df: the feature
                :param device: where to train model
                :param dropout: the dropout rate
                :param in_feat: the shape of input feature in dimension 1
                :param cat_feature: category features
        """
        super(TransEmbedding, self).__init__()
        self.time_pe = PosEncoding(dim=in_feats, device=device, base=100)
        # time_emb = time_pe(torch.sin(torch.tensor(df['time_span'].values)/86400*torch.pi))
        self.cat_table = nn.ModuleDict({col: nn.Embedding(max(df[col].unique(
        ))+1, in_feats).to(device) for col in cat_features if col not in {"Labels", "Time"}})
        self.label_table = nn.Embedding(3, in_feats, padding_idx=2).to(device)
        self.time_emb = None
        self.emb_dict = None
        self.label_emb = None
        self.cat_features = cat_features
        self.forward_mlp = nn.ModuleList(
            [nn.Linear(in_feats, in_feats) for i in range(len(cat_features))])
        self.dropout = nn.Dropout(dropout)

    def forward_emb(self, df):
        if self.emb_dict is None:
            self.emb_dict = self.cat_table
        # Convert pandas.Series to torch.Tensor
        support = {col: self.emb_dict[col](torch.tensor(df[col].values, dtype=torch.long).to(self.emb_dict[col].weight.device))
                   for col in self.cat_features if col not in {"Labels", "Time"}}
        # self.time_emb = self.time_pe(torch.sin(torch.tensor(df['time_span'])/86400*torch.pi))
        # support['time_span'] = self.time_emb
        # support['labels'] = self.label_table(df['labels'])
        return support

    def forward(self, df):
        support = self.forward_emb(df)
        output = 0
        for i, k in enumerate(support.keys()):
            # if k =='time_span':
            #    print(df[k].shape)
            support[k] = self.dropout(support[k])
            support[k] = self.forward_mlp[i](support[k])
            output = output + support[k]
        return output


class TransformerConv(nn.Module):

    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 # feat_drop=0.6,
                 # attn_drop=0.6,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        """
        Initialize the transformer layer.
        Attentional weights are jointly optimized in an end-to-end mechanism with graph neural networks and fraud detection networks.
            :param in_feat: the shape of input feature
            :param out_feats: the shape of output feature
            :param num_heads: the number of multi-head attention 
            :param bias: whether to use bias
            :param allow_zero_in_degree: whether to allow zero in degree
            :param skip_feat: whether to skip some feature 
            :param gated: whether to use gate
            :param layer_norm: whether to use layer regularization
            :param activation: the type of activation function   
        """

        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats*self._num_heads, bias=bias)

        # self.feat_dropout = nn.Dropout(p=feat_drop)
        # self.attn_dropout = nn.Dropout(p=attn_drop)
        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats*self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3*self._out_feats*self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats*self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        """
        Description: Transformer Graph Convolution
        :param graph: input graph
            :param feat: input feat
            :param get_attention: whether to get attention
        """

        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        # check if feat is a tuple
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # Step 0. q, k, v
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)
        # Assign features to nodes
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})
        # Step 1. dot product
        graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))

        # Step 2. edge softmax to compute attention scores
        graph.edata['sa'] = edge_softmax(
            graph, graph.edata['a'] / self._out_feats**0.5)

        # Step 3. Broadcast softmax value to each edge, and aggregate dst node
        graph.update_all(fn.u_mul_e('ft_v', 'sa', 'attn'),
                         fn.sum('attn', 'agg_u'))

        # output results to the destination nodes
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats*self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst


class GraphAttnModel(nn.Module):
    def __init__(self,
                 in_feats,
                 hidden_dim,
                 n_layers,
                 n_classes,
                 heads,
                 activation,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 post_proc=True,
                 n2v_feat=True,
                 drop=None,
                 ref_df=None,
                 cat_features=None,
                 nei_features=None,
                 device='cpu'):
        """
        Initialize the GTAN-GNN model 
        :param in_feats: the shape of input feature
                :param hidden_dim: model hidden layer dimension
                :param n_layers: the number of GTAN layers
                :param n_classes: the number of classification
                :param heads: the number of multi-head attention 
                :param activation: the type of activation function
                :param skip_feat: whether to skip some feature
                :param gated: whether to use gate
        :param layer_norm: whether to use layer regularization
                :param post_proc: whether to use post processing
                :param n2v_feat: whether to use n2v features
        :param drop: whether to use drop
                :param ref_df: whether to refer other node features
                :param cat_features: category features
                :param nei_features: neighborhood statistic features
        :param device: where to train model
        """

        super(GraphAttnModel, self).__init__()
        self.in_feats = in_feats
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_classes = n_classes
        self.heads = heads
        self.activation = activation
        # self.input_drop = lambda x: x
        self.input_drop = nn.Dropout(drop[0])
        self.drop = drop[1]
        self.output_drop = nn.Dropout(self.drop)
        # self.pn = PairNorm(mode=pairnorm)
        if n2v_feat:
            self.n2v_mlp = TransEmbedding(
                ref_df, device=device, in_feats=in_feats, cat_features=cat_features)
        else:
            self.n2v_mlp = lambda x: x
        self.layers = nn.ModuleList()
        self.layers.append(nn.Embedding(
            n_classes+1, in_feats, padding_idx=n_classes))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(
            nn.Linear(self.in_feats, self.hidden_dim*self.heads[0]))
        self.layers.append(nn.Sequential(nn.BatchNorm1d(self.hidden_dim*self.heads[0]),
                                         nn.PReLU(),
                                         nn.Dropout(self.drop),
                                         nn.Linear(self.hidden_dim *
                                                   self.heads[0], in_feats)
                                         ))

        # build multiple layers
        self.layers.append(TransformerConv(in_feats=self.in_feats,
                                           out_feats=self.hidden_dim,
                                           num_heads=self.heads[0],
                                           skip_feat=skip_feat,
                                           gated=gated,
                                           layer_norm=layer_norm,
                                           activation=self.activation))

        for l in range(0, (self.n_layers - 1)):
            # due to multi-head, the in_dim = num_hidden * num_heads
            self.layers.append(TransformerConv(in_feats=self.hidden_dim * self.heads[l - 1],
                                               out_feats=self.hidden_dim,
                                               num_heads=self.heads[l],
                                               skip_feat=skip_feat,
                                               gated=gated,
                                               layer_norm=layer_norm,
                                               activation=self.activation))
        if post_proc:
            self.layers.append(nn.Sequential(nn.Linear(self.hidden_dim * self.heads[-1], self.hidden_dim * self.heads[-1]),
                                             nn.BatchNorm1d(
                                                 self.hidden_dim * self.heads[-1]),
                                             nn.PReLU(),
                                             nn.Dropout(self.drop),
                                             nn.Linear(self.hidden_dim * self.heads[-1], self.n_classes)))
        else:
            self.layers.append(nn.Linear(self.hidden_dim *
                               self.heads[-1], self.n_classes))

    def forward(self, blocks, features, labels, n2v_feat=None):
        """
        :param blocks: train blocks
        :param features: train features  (|input|, feta_dim)
        :param labels: train labels (|input|, )
        :param n2v_feat: whether to use n2v features 
        """

        if n2v_feat is None:
            h = features
        else:
            h = self.n2v_mlp(n2v_feat)
            h = features + h

        label_embed = self.input_drop(self.layers[0](labels))
        label_embed = self.layers[1](h) + self.layers[2](label_embed)
        label_embed = self.layers[3](label_embed)
        h = h + label_embed  # residual

        for l in range(self.n_layers):
            h = self.output_drop(self.layers[l+4](blocks[l], h))

        logits = self.layers[-1](h)

        return logits


# 1. 数据准备
# 模拟图数据
num_nodes = 100
num_edges = 200
g = dgl.rand_graph(num_nodes, num_edges)

# 模拟节点特征
in_feats = 82
features = torch.randn(num_nodes, in_feats)

# 模拟节点标签
n_classes = 3
labels = torch.randint(0, n_classes, (num_nodes,))

# 模拟参考数据
ref_df = pd.DataFrame({
    "Target": torch.randint(0, 5, (num_nodes,)).tolist(),
    "Type": torch.randint(0, 3, (num_nodes,)).tolist(),
    "Location": torch.randint(0, 4, (num_nodes,)).tolist()
})

# 2. 模型初始化
hidden_dim = 64
n_layers = 2
heads = [4, 4]
activation = nn.PReLU()
skip_feat = True
gated = True
layer_norm = True
post_proc = True
n2v_feat = True
drop = [0.2, 0.2]
device = 'cpu'

model = GraphAttnModel(
    in_feats=in

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
import pandas as pd


# 1. 数据准备
# 模拟图数据
num_nodes = 100
num_edges = 200
g = dgl.rand_graph(num_nodes, num_edges)

# 模拟节点特征
in_feats = 82
features = torch.randn(num_nodes, in_feats)

# 模拟节点标签
n_classes = 3
labels = torch.randint(0, n_classes, (num_nodes,))

# 模拟参考数据
ref_df = pd.DataFrame({
    "Target": torch.randint(0, 5, (num_nodes,)).tolist(),
    "Type": torch.randint(0, 3, (num_nodes,)).tolist(),
    "Location": torch.randint(0, 4, (num_nodes,)).tolist()
})

# 将 ref_df 中的分类特征转换为 Tensor
n2v_feat = torch.tensor(ref_df[cat_features].values, dtype=torch.long)

# 2. 模型初始化
hidden_dim = 64
n_layers = 2
heads = [4, 4]
activation = nn.PReLU()
skip_feat = True
gated = True
layer_norm = True
post_proc = True
n2v_feat_flag = True  # 为了避免与 n2v_feat 变量名冲突
drop = [0.2, 0.2]
device = 'cpu'

model = GraphAttnModel(
    in_feats=in_feats,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    n_classes=n_classes,
    heads=heads,
    activation=activation,
    skip_feat=skip_feat,
    gated=gated,
    layer_norm=layer_norm,
    post_proc=post_proc,
    n2v_feat=n2v_feat_flag,
    drop=drop,
    ref_df=ref_df,
    cat_features=cat_features,
    device=device
)

# 3. 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 模型训练
num_epochs = 10
for epoch in range(num_epochs):
    # 前向传播
    blocks = [g]  # 这里简化为使用整个图作为一个 block
    logits = model(blocks, features, labels, n2v_feat)
    loss = criterion(logits, labels)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# 5. 模型评估
with torch.no_grad():
    test_logits = model(blocks, features, labels, n2v_feat)
    predicted_labels = torch.argmax(test_logits, dim=1)
    accuracy = (predicted_labels == labels).float().mean()
    print(f'Accuracy: {accuracy.item()}')

In [None]:
import dgl
import torch
import pandas as pd

# ----------------------------
# 模拟图结构 (1000笔交易)
# ----------------------------
num_nodes = 1000
src_nodes = torch.randint(0, num_nodes, (5000,))  # 随机生成5000条交易边
dst_nodes = torch.randint(0, num_nodes, (5000,))
graph = dgl.graph((src_nodes, dst_nodes))

# 构建2层采样子图块
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
blocks = sampler.sample_blocks(graph, torch.tensor([0,1,2]))  # 示例采样

# ----------------------------
# 节点特征 (数值型特征)
# ----------------------------
features = torch.randn(num_nodes, 82)  # 82维原始特征

# ----------------------------
# 标签信息 (0-正常，1-欺诈，2-padding)
# ----------------------------
labels = torch.cat([
    torch.zeros(800),   # 800正常交易
    torch.ones(100),    # 100欺诈交易
    torch.full((100,), 2)  # 100未标注
])

# ----------------------------
# 结构化特征DataFrame
# ----------------------------
n2v_df = pd.DataFrame({
    'Target': np.random.randint(0, 5, num_nodes),  # 5种目标账户类型
    'Type': np.random.randint(0, 3, num_nodes),    # 3种交易类型
    'Location': np.random.randint(0, 10, num_nodes), # 10个地区编码
    'time_span': np.random.uniform(0, 86400, num_nodes)  # 交易时间戳
})


In [None]:
model = GraphAttnModel(
    in_feats=82,
    hidden_dim=128,
    n_layers=2,
    n_classes=2,
    heads=[4,4],  # 每层4个头
    activation=nn.PReLU(),
    cat_features=cat_features,
    device='cuda',
    drop=[0.2,0.3]
)


In [None]:
import dgl
import torch
import numpy as np
import pandas as pd
from torch import nn

# 配置参数
num_nodes = 1000
in_feats = 82
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cat_features = ["Target", "Type", "Location"]

# ----------------------------
# 1. 数据准备（关键修正点）
# ----------------------------
# 生成完整的ref_df包含必须的类别列
ref_df = pd.DataFrame({
    'Target': np.random.randint(0, 5, num_nodes),
    'Type': np.random.randint(0, 3, num_nodes),
    'Location': np.random.randint(0, 10, num_nodes),
    'time_span': np.random.uniform(0, 86400, num_nodes),
    'labels': np.random.randint(0, 3, num_nodes)  # 添加labels列
}).astype({'labels': 'category'})

# ----------------------------
# 2. 构建图结构
# ----------------------------
# 创建随机图
src = torch.randint(0, num_nodes, (5000,))
dst = torch.randint(0, num_nodes, (5000,))
g = dgl.graph((src, dst)).to(device)

# 构建2层采样子图
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
seed_nodes = torch.arange(num_nodes).to(device)
blocks = sampler.sample_blocks(g, seed_nodes)

# ----------------------------
# 3. 初始化模型（关键参数设置）
# ----------------------------
model = GraphAttnModel(
    in_feats=in_feats,
    hidden_dim=64,
    n_layers=2,
    n_classes=2,
    heads=[2, 2],
    activation=nn.PReLU(),
    n2v_feat=True,   # 明确启用n2v特征
    ref_df=ref_df,   # 传递有效DataFrame
    cat_features=cat_features,
    device=device,
    drop=[0.1, 0.2]
).to(device)

# ----------------------------
# 4. 准备输入数据
# ----------------------------
# 特征张量 (模拟数值特征)
features = torch.randn(num_nodes, in_feats).to(device)

# 标签数据 (包含padding索引2)
labels = torch.cat([
    torch.randint(0, 2, (800,)),   # 已标注数据
    torch.full((200,), 2)          # 未标注数据
]).to(device)

# ----------------------------
# 5. 执行前向传播
# ----------------------------
with torch.no_grad():
    logits = model(blocks, features, labels, n2v_feat=ref_df)
    print("输出logits尺寸:", logits.shape)  # 应输出 torch.Size([1000, 2])

# ----------------------------
# 6. 验证关键组件
# ----------------------------
# 检查TransEmbedding输出
trans_emb = model.n2v_mlp
test_df = ref_df.iloc[:5]  # 取前5个样本测试
emb_output = trans_emb(test_df)
print("\n特征增强示例:")
print("输入维度:", test_df.shape)
print("输出维度:", emb_output.shape)  # 应输出 torch.Size([5, 82])

# 检查TransformerConv层
conv_layer = model.layers[4]
h = torch.randn(g.num_nodes(), in_feats).to(device)
out = conv_layer(g, h)
print("\n图注意力层输出尺寸:", out.shape)  # 应输出 torch.Size([1000, 128])


In [12]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
from dgl.dataloading import MultiLayerFullNeighborSampler, NodeDataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 定义 TransformerConv 层
class TransformerConv(nn.Module):
    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph.')

        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)

        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})

        graph.apply_edges(dgl.function.u_dot_v('ft', 'ft', 'a'))
        graph.edata['sa'] = dgl.nn.functional.edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        graph.update_all(dgl.function.u_mul_e('ft_v', 'sa', 'attn'),
                         dgl.function.sum('attn', 'agg_u'))

        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst

# 定义 GTAN 模型
class GTAN(nn.Module):
    def __init__(self, in_feats, hidden_dim, n_layers, n_classes, heads):
        super(GTAN, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(TransformerConv(in_feats=in_feats,
                                           out_feats=hidden_dim,
                                           num_heads=heads[0]))
        for l in range(0, (n_layers - 1)):
            self.layers.append(TransformerConv(in_feats=hidden_dim * heads[l - 1],
                                               out_feats=hidden_dim,
                                               num_heads=heads[l]))
        self.fc = nn.Linear(hidden_dim * heads[-1], n_classes)

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)
        h = self.fc(h)
        return h

# 随机生成类似 FFSD 的数据集
def generate_random_ffsd_data(num_nodes, num_features):
    data = {
        "Source": np.random.randint(0, 10, num_nodes),
        "Target": np.random.randint(0, 10, num_nodes),
        "Location": np.random.randint(0, 5, num_nodes),
        "Type": np.random.randint(0, 3, num_nodes),
        "Amount": np.random.randn(num_nodes),
        "Time": np.random.randint(0, 100, num_nodes),
        "Labels": np.random.randint(0, 2, num_nodes)
    }
    df = pd.DataFrame(data)
    cal_list = ["Source", "Target", "Location", "Type"]
    for col in cal_list:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].apply(str).values)
    feat_data = df.drop("Labels", axis=1)
    labels = df["Labels"]
    return feat_data, labels

# 构建图
def build_graph(feat_data):
    alls = []
    allt = []
    pair = ["Source", "Target", "Location", "Type"]
    for column in pair:
        src, tgt = [], []
        edge_per_trans = 3
        for c_id, c_df in feat_data.groupby(column):
            c_df = c_df.sort_values(by="Time")
            df_len = len(c_df)
            sorted_idxs = c_df.index
            src.extend([sorted_idxs[i] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
            tgt.extend([sorted_idxs[i + j] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
        alls.extend(src)
        allt.extend(tgt)
    alls = np.array(alls)
    allt = np.array(allt)
    g = dgl.graph((alls, allt))
    return g

# 训练模型
def train_model(model, g, features, labels, train_idx, test_idx, epochs=100, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        logits = model(g, features)
        loss = criterion(logits[train_idx], labels[train_idx])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                logits = model(g, features)
                train_preds = torch.argmax(logits[train_idx], dim=1)
                test_preds = torch.argmax(logits[test_idx], dim=1)
                train_acc = (train_preds == labels[train_idx]).float().mean()
                test_acc = (test_preds == labels[test_idx]).float().mean()
                print(f'Epoch {epoch}, Loss: {loss.item()}, Train Acc: {train_acc.item()}, Test Acc: {test_acc.item()}')

# 主函数
def main():
    num_nodes = 1000
    num_features = 6
    feat_data, labels = generate_random_ffsd_data(num_nodes, num_features) # 生成随机数据
    g = build_graph(feat_data)

    features = torch.from_numpy(feat_data.to_numpy()).float()
    labels = torch.from_numpy(labels.to_numpy()).long()

    train_idx, test_idx = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)

    in_feats = num_features
    hidden_dim = 16
    n_layers = 2
    n_classes = 2
    heads = [2, 2]

    model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    train_model(model, g, features, labels, train_idx, test_idx)

if __name__ == "__main__":
    main()

Epoch 0, Loss: 0.7450714707374573, Train Acc: 0.48625001311302185, Test Acc: 0.47999998927116394
Epoch 10, Loss: 0.6960091590881348, Train Acc: 0.5174999833106995, Test Acc: 0.49000000953674316
Epoch 20, Loss: 0.6915732026100159, Train Acc: 0.5099999904632568, Test Acc: 0.45500001311302185
Epoch 30, Loss: 0.6888428330421448, Train Acc: 0.5262500047683716, Test Acc: 0.5099999904632568
Epoch 40, Loss: 0.6863663196563721, Train Acc: 0.543749988079071, Test Acc: 0.48500001430511475
Epoch 50, Loss: 0.6836202144622803, Train Acc: 0.5362499952316284, Test Acc: 0.5249999761581421
Epoch 60, Loss: 0.6804350018501282, Train Acc: 0.5462499856948853, Test Acc: 0.4950000047683716
Epoch 70, Loss: 0.6767117381095886, Train Acc: 0.5487499833106995, Test Acc: 0.47999998927116394
Epoch 80, Loss: 0.6723849773406982, Train Acc: 0.5550000071525574, Test Acc: 0.4699999988079071
Epoch 90, Loss: 0.667324423789978, Train Acc: 0.5562499761581421, Test Acc: 0.48500001430511475


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
from dgl.dataloading import MultiLayerFullNeighborSampler, NodeDataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 定义 TransformerConv 层
class TransformerConv(nn.Module):
    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph.')

        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)

        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})

        graph.apply_edges(dgl.function.u_dot_v('ft', 'ft', 'a'))
        graph.edata['sa'] = dgl.nn.functional.edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        graph.update_all(dgl.function.u_mul_e('ft_v', 'sa', 'attn'),
                         dgl.function.sum('attn', 'agg_u'))

        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst

# 定义 GTAN 模型
class GTAN(nn.Module):
    def __init__(self, in_feats, hidden_dim, n_layers, n_classes, heads):
        super(GTAN, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(TransformerConv(in_feats=in_feats,
                                           out_feats=hidden_dim,
                                           num_heads=heads[0]))
        for l in range(0, (n_layers - 1)):
            self.layers.append(TransformerConv(in_feats=hidden_dim * heads[l - 1],
                                               out_feats=hidden_dim,
                                               num_heads=heads[l]))
        self.fc = nn.Linear(hidden_dim * heads[-1], n_classes)

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)
        h = self.fc(h)
        return h

# 随机生成类似 FFSD 的数据集
def generate_random_ffsd_data(num_nodes, num_features):
    data = {
        "Source": np.random.randint(0, 10, num_nodes),
        "Target": np.random.randint(0, 10, num_nodes),
        "Location": np.random.randint(0, 5, num_nodes),
        "Type": np.random.randint(0, 3, num_nodes),
        "Amount": np.random.randn(num_nodes),
        "Time": np.random.randint(0, 100, num_nodes),
        "Labels": np.random.randint(0, 2, num_nodes)
    }
    df = pd.DataFrame(data)
    cal_list = ["Source", "Target", "Location", "Type"]
    for col in cal_list:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].apply(str).values)
    feat_data = df.drop("Labels", axis=1)
    labels = df["Labels"]
    return feat_data, labels

# 构建图
def build_graph(feat_data):
    alls = []
    allt = []
    pair = ["Source", "Target", "Location", "Type"]
    for column in pair:
        src, tgt = [], []
        edge_per_trans = 3
        for c_id, c_df in feat_data.groupby(column):
            c_df = c_df.sort_values(by="Time")
            df_len = len(c_df)
            sorted_idxs = c_df.index
            src.extend([sorted_idxs[i] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
            tgt.extend([sorted_idxs[i + j] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
        alls.extend(src)
        allt.extend(tgt)
    alls = np.array(alls)
    allt = np.array(allt)
    g = dgl.graph((alls, allt))
    return g

# 训练模型
def train_model(model, g, features, labels, train_idx, test_idx, epochs=100, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        logits = model(g, features)
        loss = criterion(logits[train_idx], labels[train_idx])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                logits = model(g, features)
                train_preds = torch.argmax(logits[train_idx], dim=1)
                test_preds = torch.argmax(logits[test_idx], dim=1)
                train_acc = (train_preds == labels[train_idx]).float().mean()
                test_acc = (test_preds == labels[test_idx]).float().mean()
                print(f'Epoch {epoch}, Loss: {loss.item()}, Train Acc: {train_acc.item()}, Test Acc: {test_acc.item()}')

# 主函数
def main():
    num_nodes = 1000
    num_features = 6
    feat_data, labels = generate_random_ffsd_data(num_nodes, num_features)
    g = build_graph(feat_data)

    features = torch.from_numpy(feat_data.to_numpy()).float()
    labels = torch.from_numpy(labels.to_numpy()).long()

    train_idx, test_idx = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)

    in_feats = num_features
    hidden_dim = 16
    n_layers = 2
    n_classes = 2
    heads = [2, 2]

    model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    train_model(model, g, features, labels, train_idx, test_idx)

    # 保存模型
    torch.save(model.state_dict(), 'gtan_model.pth')

    # 模拟新数据
    new_num_nodes = 100
    new_feat_data, _ = generate_random_ffsd_data(new_num_nodes, num_features)
    new_g = build_graph(new_feat_data)
    new_features = torch.from_numpy(new_feat_data.to_numpy()).float()

    # 加载模型
    loaded_model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    loaded_model.load_state_dict(torch.load('gtan_model.pth'))
    loaded_model.eval()

    # 进行预测
    with torch.no_grad():
        logits = loaded_model(new_g, new_features)
        predictions = torch.argmax(logits, dim=1)
        print("预测结果:", predictions.numpy())

if __name__ == "__main__":
    main()

Epoch 0, Loss: 0.753218412399292, Train Acc: 0.5099999904632568, Test Acc: 0.47999998927116394
Epoch 10, Loss: 0.695364236831665, Train Acc: 0.5199999809265137, Test Acc: 0.5699999928474426
Epoch 20, Loss: 0.686575710773468, Train Acc: 0.5099999904632568, Test Acc: 0.47999998927116394
Epoch 30, Loss: 0.6851438283920288, Train Acc: 0.5299999713897705, Test Acc: 0.5799999833106995
Epoch 40, Loss: 0.6843498349189758, Train Acc: 0.5350000262260437, Test Acc: 0.5849999785423279
Epoch 50, Loss: 0.6827506422996521, Train Acc: 0.53125, Test Acc: 0.5799999833106995
Epoch 60, Loss: 0.680972695350647, Train Acc: 0.5400000214576721, Test Acc: 0.5799999833106995
Epoch 70, Loss: 0.6784475445747375, Train Acc: 0.5475000143051147, Test Acc: 0.5849999785423279
Epoch 80, Loss: 0.6747417449951172, Train Acc: 0.5375000238418579, Test Acc: 0.5649999976158142
Epoch 90, Loss: 0.6698616743087769, Train Acc: 0.550000011920929, Test Acc: 0.5600000023841858
预测结果: [0 1 1 0 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
from dgl.dataloading import MultiLayerFullNeighborSampler, NodeDataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 定义 TransformerConv 层
class TransformerConv(nn.Module):
    def __init__(self,
                 in_feats,  # 输入特征的维度
                 out_feats,  # 输出特征的维度
                 num_heads,  # 注意力头的数量
                 bias=True,  # 是否使用偏置项
                 allow_zero_in_degree=False,  # 是否允许图中存在入度为 0 的节点
                 skip_feat=True,  # 是否使用跳跃连接
                 gated=True,  # 是否使用门控机制
                 layer_norm=True,  # 是否使用层归一化
                 activation=nn.PReLU()):  # 激活函数
        super(TransformerConv, self).__init__()
        # 输入源节点和目标节点的特征维度
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        # 定义线性层，用于计算查询（query）、键（key）和值（value）
        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        # 如果使用跳跃连接，定义线性层
        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        # 如果使用门控机制，定义线性层
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        # 如果使用层归一化，定义层归一化层
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        # 创建图的本地副本
        graph = graph.local_var()

        # 如果不允许存在入度为 0 的节点，检查图中是否存在此类节点
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph.')

        # 如果输入特征是元组，分别获取源节点和目标节点的特征
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # 计算查询、键和值
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)

        # 将查询和值存储在源节点数据中，将键存储在目标节点数据中
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})

        # 计算边的注意力分数
        graph.apply_edges(dgl.function.u_dot_v('ft', 'ft', 'a'))
        # 对注意力分数进行归一化
        graph.edata['sa'] = dgl.nn.functional.edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        # 聚合邻居节点的特征
        graph.update_all(dgl.function.u_mul_e('ft_v', 'sa', 'attn'),
                         dgl.function.sum('attn', 'agg_u'))

        # 重塑聚合后的特征
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        # 如果使用跳跃连接
        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            # 如果使用门控机制
            if self.gate is not None:
                # 计算门控值
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                # 结合跳跃连接和聚合后的特征
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        # 如果使用层归一化
        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        # 如果使用激活函数
        if self.activation is not None:
            rst = self.activation(rst)

        # 如果需要返回注意力分数
        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst

# 定义 GTAN 模型
class GTAN(nn.Module):
    def __init__(self, in_feats, hidden_dim, n_layers, n_classes, heads):
        super(GTAN, self).__init__()
        self.layers = nn.ModuleList()
        # 添加第一层 TransformerConv 层
        self.layers.append(TransformerConv(in_feats=in_feats,
                                           out_feats=hidden_dim,
                                           num_heads=heads[0]))
        # 添加后续的 TransformerConv 层
        for l in range(0, (n_layers - 1)):
            self.layers.append(TransformerConv(in_feats=hidden_dim * heads[l - 1],
                                               out_feats=hidden_dim,
                                               num_heads=heads[l]))
        # 定义全连接层，用于输出分类结果
        self.fc = nn.Linear(hidden_dim * heads[-1], n_classes)

    def forward(self, g, features):
        h = features
        # 依次通过 TransformerConv 层
        for layer in self.layers:
            h = layer(g, h)
        # 通过全连接层
        h = self.fc(h)
        return h

# 随机生成类似 FFSD 的数据集
def generate_random_ffsd_data(num_nodes, num_features):
    data = {
        "Source": np.random.randint(0, 10, num_nodes),
        "Target": np.random.randint(0, 10, num_nodes),
        "Location": np.random.randint(0, 5, num_nodes),
        "Type": np.random.randint(0, 3, num_nodes),
        "Amount": np.random.randn(num_nodes),
        "Time": np.random.randint(0, 100, num_nodes),
        "Labels": np.random.randint(0, 2, num_nodes)
    }
    # 创建 DataFrame
    df = pd.DataFrame(data)
    cal_list = ["Source", "Target", "Location", "Type"]
    # 对分类特征进行编码
    for col in cal_list:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].apply(str).values)
    # 提取特征数据
    feat_data = df.drop("Labels", axis=1)
    # 提取标签数据
    labels = df["Labels"]
    return feat_data, labels

# 构建图
def build_graph(feat_data):
    alls = []
    allt = []
    pair = ["Source", "Target", "Location", "Type"]
    for column in pair:
        src, tgt = [], []
        edge_per_trans = 3
        # 按列进行分组
        for c_id, c_df in feat_data.groupby(column):
            # 按时间排序
            c_df = c_df.sort_values(by="Time")
            df_len = len(c_df)
            sorted_idxs = c_df.index
            # 构建边
            src.extend([sorted_idxs[i] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
            tgt.extend([sorted_idxs[i + j] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
        alls.extend(src)
        allt.extend(tgt)
    alls = np.array(alls)
    allt = np.array(allt)
    # 创建图
    g = dgl.graph((alls, allt))
    return g

# 训练模型
def train_model(model, g, features, labels, train_idx, test_idx, epochs=100, lr=0.001):
    # 定义优化器
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # 定义损失函数
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # 模型训练模式
        model.train()
        # 前向传播
        logits = model(g, features)
        # 计算损失
        loss = criterion(logits[train_idx], labels[train_idx])

        # 梯度清零
        optimizer.zero_grad()
        # 反向传播
        loss.backward()
        # 更新参数
        optimizer.step()

        # 每 10 个 epoch 输出一次训练和测试准确率
        if epoch % 10 == 0:
            # 模型评估模式
            model.eval()
            with torch.no_grad():
                logits = model(g, features)
                # 计算训练集预测结果
                train_preds = torch.argmax(logits[train_idx], dim=1)
                # 计算测试集预测结果
                test_preds = torch.argmax(logits[test_idx], dim=1)
                # 计算训练集准确率
                train_acc = (train_preds == labels[train_idx]).float().mean()
                # 计算测试集准确率
                test_acc = (test_preds == labels[test_idx]).float().mean()
                print(f'Epoch {epoch}, Loss: {loss.item()}, Train Acc: {train_acc.item()}, Test Acc: {test_acc.item()}')

# 主函数
def main():
    num_nodes = 1000
    num_features = 6
    # 生成随机数据集
    feat_data, labels = generate_random_ffsd_data(num_nodes, num_features)
    # 构建图
    g = build_graph(feat_data)

    # 将特征数据转换为 PyTorch 张量
    features = torch.from_numpy(feat_data.to_numpy()).float()
    # 将标签数据转换为 PyTorch 张量
    labels = torch.from_numpy(labels.to_numpy()).long()

    # 划分训练集和测试集
    train_idx, test_idx = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)

    in_feats = num_features # 输入特征的维度
    hidden_dim = 16   # 隐藏层的维度
    n_layers = 2 # TransformerConv 层的数量
    n_classes = 2 #      输出的类别数
    heads = [2, 2] # 每个 TransformerConv 层的注意力头的数量

    # 创建 GTAN 模型
    model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    # 训练模型
    train_model(model, g, features, labels, train_idx, test_idx)

    # 保存模型
    torch.save(model.state_dict(), 'gtan_model.pth')

    # 模拟新数据
    new_num_nodes = 100
    new_feat_data, _ = generate_random_ffsd_data(new_num_nodes, num_features)
    # 构建新图
    new_g = build_graph(new_feat_data)
    # 将新特征数据转换为 PyTorch 张量
    new_features = torch.from_numpy(new_feat_data.to_numpy()).float()

    # 加载模型
    loaded_model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    loaded_model.load_state_dict(torch.load('gtan_model.pth'))
    # 模型评估模式
    loaded_model.eval()

    # 进行预测
    with torch.no_grad():
        logits = loaded_model(new_g, new_features)
        predictions = torch.argmax(logits, dim=1)
        print("预测结果:", predictions.numpy())

if __name__ == "__main__":
    main()

Epoch 0, Loss: 0.7000607252120972, Train Acc: 0.5024999976158142, Test Acc: 0.49000000953674316
Epoch 10, Loss: 0.6918095350265503, Train Acc: 0.5412499904632568, Test Acc: 0.5
Epoch 20, Loss: 0.6888192892074585, Train Acc: 0.5462499856948853, Test Acc: 0.4699999988079071
Epoch 30, Loss: 0.6870434284210205, Train Acc: 0.5337499976158142, Test Acc: 0.4749999940395355
Epoch 40, Loss: 0.6836686730384827, Train Acc: 0.5512499809265137, Test Acc: 0.5
Epoch 50, Loss: 0.6786544322967529, Train Acc: 0.5649999976158142, Test Acc: 0.5149999856948853
Epoch 60, Loss: 0.6714608669281006, Train Acc: 0.5737500190734863, Test Acc: 0.5249999761581421
Epoch 70, Loss: 0.6621057391166687, Train Acc: 0.5950000286102295, Test Acc: 0.5299999713897705
Epoch 80, Loss: 0.6528637409210205, Train Acc: 0.5962499976158142, Test Acc: 0.5149999856948853
Epoch 90, Loss: 0.6451188921928406, Train Acc: 0.6137499809265137, Test Acc: 0.5199999809265137
预测结果: [1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0

In [15]:
def generate_random_ffsd_data(num_nodes, num_features):
    data = {
        "Source": np.random.randint(0, 10, num_nodes),
        "Target": np.random.randint(0, 10, num_nodes),
        "Location": np.random.randint(0, 5, num_nodes),
        "Type": np.random.randint(0, 3, num_nodes),
        "Amount": np.random.randn(num_nodes),
        "Time": np.random.randint(0, 100, num_nodes),
        "Labels": np.random.randint(0, 2, num_nodes)
    }
    # 创建 DataFrame
    df = pd.DataFrame(data)
    cal_list = ["Source", "Target", "Location", "Type"]
    # 对分类特征进行编码
    for col in cal_list:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].apply(str).values)
    # 提取特征数据
    feat_data = df.drop("Labels", axis=1)
    # 提取标签数据
    labels = df["Labels"]
    return feat_data, labels

In [26]:
num_nodes = 1000
num_features = 6
# 生成随机数据集
feat_data, labels = generate_random_ffsd_data(num_nodes, num_features)
print(feat_data.shape)
print(feat_data.head())

(1000, 6)
   Source  Target  Location  Type    Amount  Time
0       0       4         3     2 -0.514793    17
1       3       5         0     0 -0.439897    86
2       8       1         0     0  0.536507    24
3       2       9         0     0 -0.176634    97
4       0       1         1     1 -0.843627    73


In [27]:
# 构建图
def build_graph(feat_data):
    alls = []
    allt = []
    pair = ["Source", "Target", "Location", "Type"]
    for column in pair:
        src, tgt = [], []
        edge_per_trans = 3
        # 按列进行分组
        for c_id, c_df in feat_data.groupby(column):
            # 按时间排序
            c_df = c_df.sort_values(by="Time")
            df_len = len(c_df)
            sorted_idxs = c_df.index
            # 构建边
            src.extend([sorted_idxs[i] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
            tgt.extend([sorted_idxs[i + j] for i in range(df_len)
                        for j in range(edge_per_trans) if i + j < df_len])
        alls.extend(src)
        allt.extend(tgt)
    alls = np.array(alls)
    allt = np.array(allt)
    # 创建图
    g = dgl.graph((alls, allt))
    return g

In [28]:
g = build_graph(feat_data)

In [29]:
 # 将特征数据转换为 PyTorch 张量
features = torch.from_numpy(feat_data.to_numpy()).float()
    # 将标签数据转换为 PyTorch 张量
labels = torch.from_numpy(labels.to_numpy()).long()


In [None]:
train_idx, test_idx = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)


In [31]:
in_feats = num_features # 输入特征的维度
hidden_dim = 16   # 隐藏层的维度
n_layers = 2 # TransformerConv 层的数量
n_classes = 2 #      输出的类别数
heads = [2, 2] # 每个 TransformerConv 层的注意力头的数量

In [32]:
# 定义 GTAN 模型
class GTAN(nn.Module):
    def __init__(self, in_feats, hidden_dim, n_layers, n_classes, heads):
        super(GTAN, self).__init__()
        self.layers = nn.ModuleList()
        # 添加第一层 TransformerConv 层
        self.layers.append(TransformerConv(in_feats=in_feats,
                                           out_feats=hidden_dim,
                                           num_heads=heads[0]))
        # 添加后续的 TransformerConv 层
        for l in range(0, (n_layers - 1)):
            self.layers.append(TransformerConv(in_feats=hidden_dim * heads[l - 1],
                                               out_feats=hidden_dim,
                                               num_heads=heads[l]))
        # 定义全连接层，用于输出分类结果
        self.fc = nn.Linear(hidden_dim * heads[-1], n_classes)

    def forward(self, g, features):
        h = features
        # 依次通过 TransformerConv 层
        for layer in self.layers:
            h = layer(g, h)
        # 通过全连接层
        h = self.fc(h)
        return h

In [None]:
# 定义 TransformerConv 层
class TransformerConv(nn.Module):
    def __init__(self,
                 in_feats,  # 输入特征的维度
                 out_feats,  # 输出特征的维度
                 num_heads,  # 注意力头的数量
                 bias=True,  # 是否使用偏置项
                 allow_zero_in_degree=False,  # 是否允许图中存在入度为 0 的节点
                 skip_feat=True,  # 是否使用跳跃连接
                 gated=True,  # 是否使用门控机制
                 layer_norm=True,  # 是否使用层归一化
                 activation=nn.PReLU()):  # 激活函数
        super(TransformerConv, self).__init__()
        # 输入源节点和目标节点的特征维度
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats 
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        # 定义线性层，用于计算查询（query）、键（key）和值（value）
        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        # 如果使用跳跃连接，定义线性层
        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        # 如果使用门控机制，定义线性层
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        # 如果使用层归一化，定义层归一化层
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        # 创建图的本地副本
        graph = graph.local_var()

        # 如果不允许存在入度为 0 的节点，检查图中是否存在此类节点
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph.')

        # 如果输入特征是元组，分别获取源节点和目标节点的特征
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # 计算查询、键和值
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)

        # 将查询和值存储在源节点数据中，将键存储在目标节点数据中
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})

        # 计算边的注意力分数
        graph.apply_edges(dgl.function.u_dot_v('ft', 'ft', 'a'))
        # 对注意力分数进行归一化
        graph.edata['sa'] = dgl.nn.functional.edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        # 聚合邻居节点的特征
        graph.update_all(dgl.function.u_mul_e('ft_v', 'sa', 'attn'),
                         dgl.function.sum('attn', 'agg_u'))

        # 重塑聚合后的特征
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        # 如果使用跳跃连接
        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            # 如果使用门控机制
            if self.gate is not None:
                # 计算门控值
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                # 结合跳跃连接和聚合后的特征
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        # 如果使用层归一化
        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        # 如果使用激活函数
        if self.activation is not None:
            rst = self.activation(rst)

        # 如果需要返回注意力分数
        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst

In [36]:
# 创建 GTAN 模型
model = GTAN(in_feats, hidden_dim, n_layers, n_classes, heads)
    


In [37]:
# 训练模型
train_model(model, g, features, labels, train_idx, test_idx)

Epoch 0, Loss: 0.7389988899230957, Train Acc: 0.5074999928474426, Test Acc: 0.4950000047683716
Epoch 10, Loss: 0.6924529075622559, Train Acc: 0.5237500071525574, Test Acc: 0.5299999713897705
Epoch 20, Loss: 0.6907057762145996, Train Acc: 0.5249999761581421, Test Acc: 0.5649999976158142
Epoch 30, Loss: 0.6886590719223022, Train Acc: 0.543749988079071, Test Acc: 0.5299999713897705
Epoch 40, Loss: 0.6877171993255615, Train Acc: 0.5325000286102295, Test Acc: 0.5299999713897705
Epoch 50, Loss: 0.6867654323577881, Train Acc: 0.5450000166893005, Test Acc: 0.5249999761581421
Epoch 60, Loss: 0.6858637928962708, Train Acc: 0.5425000190734863, Test Acc: 0.5249999761581421
Epoch 70, Loss: 0.6848348379135132, Train Acc: 0.5537499785423279, Test Acc: 0.5099999904632568
Epoch 80, Loss: 0.6834627389907837, Train Acc: 0.5550000071525574, Test Acc: 0.5149999856948853
Epoch 90, Loss: 0.6815904378890991, Train Acc: 0.5612499713897705, Test Acc: 0.5099999904632568


In [38]:
import dgl

# 输入为整数的情况
in_feats = 10
src_feats, dst_feats = dgl.utils.expand_as_pair(in_feats)
print(f"源节点特征维度: {src_feats}, 目标节点特征维度: {dst_feats}")

# 输入为元组的情况
in_feats = (10, 20)
src_feats, dst_feats = dgl.utils.expand_as_pair(in_feats)
print(f"源节点特征维度: {src_feats}, 目标节点特征维度: {dst_feats}")

源节点特征维度: 10, 目标节点特征维度: 10
源节点特征维度: 10, 目标节点特征维度: 20


In [None]:
import torch.nn as nn
class network(nn.Module):
    def __init__(self):
        super(network, self).__init__()
    def __init__(self):
        print("PosEncoding")
    def forward(self, pos):
        print("PosEncoding forward")
        return 123

In [9]:
network()

PosEncoding


AttributeError: 'network' object has no attribute '_modules'

In [14]:
import torch.nn as nn

class network(nn.Module):
    def __init__(self):
        super(network, self).__init__()
        print("init")

    def forward(self, pos):
        print("forward")
        return pos

In [15]:
# 实例化类
net = network()
# 调用forward方法，传入参数
result = net(10)  # 这里传入10作为参数，实际传入的参数可以根据需要调整
print(result)

init
forward
10


In [27]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, cat_features):
        super(MyModel, self).__init__()
        self.forward_mlp = nn.ModuleList(
            [nn.Linear(4, 4) for i in range(len(cat_features))]
        )
        
    def forward(self, x):
        # x 的形状假设为 (batch_size, num_features, feature_dim)
        # 其中 num_features 是 cat_features 的长度，feature_dim 是每个特征的维度（这里是3）
        outputs = []
        for i, layer in enumerate(self.forward_mlp):
            outputs.append(layer(x[:, i, :]))  # 对每个特征向量应用对应的全连接层
        return torch.stack(outputs, dim=1)  # 将输出堆叠起来，形状为 (batch_size, num_features, feature_dim)

# 示例
cat_features = [1, 2, 3]  # 假设有 3 个类别特征
model = MyModel(cat_features)

# 创建一个输入张量，假设 batch_size=2，num_features=3，feature_dim=3
input_tensor = torch.tensor([
    [[1.0, 2.0, 3.0,4.0], 
     [4.0, 5.0, 6.0, 3.0], 
     [7.0, 8.0, 9.0, 3.0]],
     
    [[10.0, 11.0, 12.0, 3.0],
     [13.0, 14.0, 15.0, 3.0],
     [16.0, 17.0, 18.0, 3.0]]
])

# 前向传播
output = model(input_tensor)
print("Output shape:", output.shape)  # 输出形状应该是 (2, 3, 3)
print("Output tensor:\n", output)

Output shape: torch.Size([2, 3, 4])
Output tensor:
 tensor([[[  0.0615,   1.0786,  -1.8683,   0.7127],
         [  4.1552,   0.0904,   1.1850,  -0.7272],
         [ -2.1136,   0.8901,  -5.4305,   3.6585]],

        [[  4.6945,  -1.2204,  -4.8430,  -2.2240],
         [ 12.3335,   0.6931,   3.1643,  -5.3755],
         [ -3.6933,   0.9422, -11.3417,   8.7332]]], grad_fn=<StackBackward0>)


In [29]:
from dgl.utils import expand_as_pair
expand_as_pair(126)

(126, 126)

In [33]:
in_src_feats, in_dst_feats = expand_as_pair(126)
print(in_src_feats, in_dst_feats)
print(type(in_src_feats))

126 126
<class 'int'>


In [None]:
in_src_feats =1
in_src_feats, in_dst_feats = expand_as_pair(in_feats)


In [4]:
import torch
import torch.nn as nn
import dgl
import dgl.function as fn
from dgl.nn import edge_softmax
import graphviz
import matplotlib.pyplot as plt


class TransformerConv(nn.Module):

    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        """
        Initialize the transformer layer.
        Attentional weights are jointly optimized in an end-to-end mechanism with graph neural networks and fraud detection networks.
            :param in_feat: the shape of input feature
            :param out_feats: the shape of output feature
            :param num_heads: the number of multi-head attention 
            :param bias: whether to use bias
            :param allow_zero_in_degree: whether to allow zero in degree
            :param skip_feat: whether to skip some feature 
            :param gated: whether to use gate
            :param layer_norm: whether to use layer regularization
            :param activation: the type of activation function   
        """

        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        """
        Description: Transformer Graph Convolution
        :param graph: input graph
            :param feat: input feat
            :param get_attention: whether to get attention
        """

        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph, '
                                 'output for those nodes will be invalid. '
                                 'This is harmful for some applications, '
                                 'causing silent performance regression. '
                                 'Adding self-loop on the input graph by '
                                 'calling `g = dgl.add_self_loop(g)` will resolve '
                                 'the issue. Setting ``allow_zero_in_degree`` '
                                 'to be `True` when constructing this module will '
                                 'suppress the check and let the code run.')

        # check if feat is a tuple
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # Step 0. q, k, v
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)
        # Assign features to nodes
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})
        # Step 1. dot product
        graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))

        # Step 2. edge softmax to compute attention scores
        graph.edata['sa'] = edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        # Step 3. Broadcast softmax value to each edge, and aggregate dst node
        graph.update_all(fn.u_mul_e('ft_v', 'sa', 'attn'),
                         fn.sum('attn', 'agg_u'))

        # output results to the destination nodes
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst


def visualize_graph(graph, attention_scores=None):
    g = graphviz.Digraph('G', filename='graph.gv')

    # Add nodes
    for i in range(graph.number_of_nodes()):
        g.node(str(i))

    # Add edges with attention scores as labels
    if attention_scores is not None:
        for i, (u, v) in enumerate(graph.edges()):
            score = attention_scores[i].item()
            g.edge(str(u.item()), str(v.item()), label=f'{score:.2f}')
    else:
        for u, v in graph.edges():
            g.edge(str(u.item()), str(v.item()))

    g.view()


def visualize_attention_scores(attention_scores):
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(attention_scores)), attention_scores.squeeze().tolist())
    plt.xlabel('Edge Index')
    plt.ylabel('Attention Score')
    plt.title('Attention Scores')
    plt.show()


# Example usage
if __name__ == "__main__":
    # Create a simple graph
    src = torch.tensor([0, 1, 2])
    dst = torch.tensor([1, 2, 0])
    graph = dgl.graph((src, dst))

    # Create node features
    in_feats = 16
    num_nodes = graph.number_of_nodes()
    feat = torch.randn(num_nodes, in_feats)

    # Create the TransformerConv layer
    out_feats = 8
    num_heads = 2
    conv = TransformerConv(in_feats, out_feats, num_heads)

    # Forward pass and get attention scores
    output, attention_scores = conv(graph, feat, get_attention=True)

    # Visualize the graph with attention scores
    visualize_graph(graph, attention_scores)

    # Visualize the attention scores
    visualize_attention_scores(attention_scores)
    

ValueError: too many values to unpack (expected 2)

In [7]:
import torch
import torch.nn as nn
import dgl
import dgl.function as fn
from dgl.nn import edge_softmax
import graphviz
import matplotlib.pyplot as plt


class TransformerConv(nn.Module):

    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 bias=True,
                 allow_zero_in_degree=False,
                 skip_feat=True,
                 gated=True,
                 layer_norm=True,
                 activation=nn.PReLU()):
        """
        Initialize the transformer layer.
        Attentional weights are jointly optimized in an end-to-end mechanism with graph neural networks and fraud detection networks.
            :param in_feat: the shape of input feature
            :param out_feats: the shape of output feature
            :param num_heads: the number of multi-head attention 
            :param bias: whether to use bias
            :param allow_zero_in_degree: whether to allow zero in degree
            :param skip_feat: whether to skip some feature 
            :param gated: whether to use gate
            :param layer_norm: whether to use layer regularization
            :param activation: the type of activation function   
        """

        super(TransformerConv, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._allow_zero_in_degree = allow_zero_in_degree
        self._num_heads = num_heads

        self.lin_query = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_key = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        self.lin_value = nn.Linear(
            self._in_src_feats, self._out_feats * self._num_heads, bias=bias)

        if skip_feat:
            self.skip_feat = nn.Linear(
                self._in_src_feats, self._out_feats * self._num_heads, bias=bias)
        else:
            self.skip_feat = None
        if gated:
            self.gate = nn.Linear(
                3 * self._out_feats * self._num_heads, 1, bias=bias)
        else:
            self.gate = None
        if layer_norm:
            self.layer_norm = nn.LayerNorm(self._out_feats * self._num_heads)
        else:
            self.layer_norm = None
        self.activation = activation

    def forward(self, graph, feat, get_attention=False):
        """
        Description: Transformer Graph Convolution
        :param graph: input graph
            :param feat: input feat
            :param get_attention: whether to get attention
        """

        graph = graph.local_var()

        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise ValueError('There are 0-in-degree nodes in the graph, '
                                 'output for those nodes will be invalid. '
                                 'This is harmful for some applications, '
                                 'causing silent performance regression. '
                                 'Adding self-loop on the input graph by '
                                 'calling `g = dgl.add_self_loop(g)` will resolve '
                                 'the issue. Setting ``allow_zero_in_degree`` '
                                 'to be `True` when constructing this module will '
                                 'suppress the check and let the code run.')

        # check if feat is a tuple
        if isinstance(feat, tuple):
            h_src = feat[0]
            h_dst = feat[1]
        else:
            h_src = feat
            h_dst = h_src[:graph.number_of_dst_nodes()]

        # Step 0. q, k, v
        q_src = self.lin_query(
            h_src).view(-1, self._num_heads, self._out_feats)
        k_dst = self.lin_key(h_dst).view(-1, self._num_heads, self._out_feats)
        v_src = self.lin_value(
            h_src).view(-1, self._num_heads, self._out_feats)
        # Assign features to nodes
        graph.srcdata.update({'ft': q_src, 'ft_v': v_src})
        graph.dstdata.update({'ft': k_dst})
        # Step 1. dot product
        graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))

        # Step 2. edge softmax to compute attention scores
        graph.edata['sa'] = edge_softmax(
            graph, graph.edata['a'] / self._out_feats ** 0.5)

        # Step 3. Broadcast softmax value to each edge, and aggregate dst node
        graph.update_all(fn.u_mul_e('ft_v', 'sa', 'attn'),
                         fn.sum('attn', 'agg_u'))

        # output results to the destination nodes
        rst = graph.dstdata['agg_u'].reshape(-1,
                                             self._out_feats * self._num_heads)

        if self.skip_feat is not None:
            skip_feat = self.skip_feat(feat[:graph.number_of_dst_nodes()])
            if self.gate is not None:
                gate = torch.sigmoid(
                    self.gate(
                        torch.concat([skip_feat, rst, skip_feat - rst], dim=-1)))
                rst = gate * skip_feat + (1 - gate) * rst
            else:
                rst = skip_feat + rst

        if self.layer_norm is not None:
            rst = self.layer_norm(rst)

        if self.activation is not None:
            rst = self.activation(rst)

        if get_attention:
            return rst, graph.edata['sa']
        else:
            return rst


def visualize_graph(graph, attention_scores=None):
    g = graphviz.Digraph('G', filename='graph.gv')

    # Add nodes
    for i in range(graph.number_of_nodes()):
        g.node(str(i))

    # Add edges with attention scores as labels
    if attention_scores is not None:
        # 对多头注意力分数进行平均
        if len(attention_scores.shape) > 1:
            attention_scores = attention_scores.mean(dim=1)
        src, dst = graph.edges()
        for i in range(len(src)):
            score = attention_scores[i].item()
            g.edge(str(src[i].item()), str(dst[i].item()), label=f'{score:.2f}')
    else:
        src, dst = graph.edges()
        for i in range(len(src)):
            g.edge(str(src[i].item()), str(dst[i].item()))

    g.view()


def visualize_attention_scores(attention_scores):
    # 对多头注意力分数进行平均
    if len(attention_scores.shape) > 1:
        attention_scores = attention_scores.mean(dim=1)
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(attention_scores)), attention_scores.squeeze().tolist())
    plt.xlabel('Edge Index')
    plt.ylabel('Attention Score')
    plt.title('Attention Scores')
    plt.show()


# Example usage
if __name__ == "__main__":
    # Create a simple graph
    src = torch.tensor([0, 1, 2])
    dst = torch.tensor([1, 2, 0])
    graph = dgl.graph((src, dst))

    # Create node features
    in_feats = 16
    num_nodes = graph.number_of_nodes()
    feat = torch.randn(num_nodes, in_feats)

    # Create the TransformerConv layer
    out_feats = 8
    num_heads = 2
    conv = TransformerConv(in_feats, out_feats, num_heads)

    # Forward pass and get attention scores
    output, attention_scores = conv(graph, feat, get_attention=True)

    # Visualize the graph with attention scores
    visualize_graph(graph, attention_scores)

    # Visualize the attention scores
    visualize_attention_scores(attention_scores)

Error: Could not open "graph.gv.pdf" for writing : Permission denied


CalledProcessError: Command '['dot', '-Kdot', '-Tpdf', '-O', 'graph.gv']' returned non-zero exit status 1. [stderr: b'Error: Could not open "graph.gv.pdf" for writing : Permission denied\r\n']