In [1]:
############################################################################
##
## Copyright (C) 2021 NVIDIA Corporation.  All rights reserved.
##
## NVIDIA Sample Code
##
## Please refer to the NVIDIA end user license agreement (EULA) associated
## with this source code for terms and conditions that govern your use of
## this software. Any use, reproduction, disclosure, or distribution of
## this software and related documentation outside the terms of the EULA
## is strictly prohibited.
##
############################################################################

In [1]:
import cudf
import dgl
from dgl.data import DGLDataset
import torch
import os
from torch.utils.dlpack import from_dlpack
from torch import nn
import torch.nn.functional as F

# Select the compute device once; later cells pass it to DGL and PyTorch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Using backend: pytorch


In [2]:
# Column name -> dtype mapping passed to cudf.read_csv when loading the
# transaction CSV files (identifiers/flags as int32, monetary values as float32).
types = dict(
    tx_id="int32",
    timestamp="int32",
    sender_account_id="int32",
    receiver_account_id="int32",
    tx_amount="float32",
    sender_init_balance="float32",
    receiver_init_balance="float32",
    alert_id="int32",
    sender_is_fraud="int32",
    receiver_is_fraud="int32",
    is_fraud="int32",
    sender_fin_balance="float32",
    receiver_fin_balance="float32",
)

In [3]:
def load_transactions(data_dir='1m_cleaned', num_parts=3):
    """Load the cleaned transaction CSV parts into a single cudf DataFrame.

    Files are expected at ``<data_dir>/fixed_transactions_part<i>.csv`` for
    ``i`` in ``1..num_parts`` and are read with the module-level ``types``
    dtype mapping. The defaults reproduce the original behavior (three parts
    under ``1m_cleaned``).

    Parameters
    ----------
    data_dir : str
        Directory containing the CSV parts.
    num_parts : int
        How many consecutive parts (starting at part 1) to load; lower it to
        iterate on a smaller sample.

    Returns
    -------
    cudf.DataFrame
        The parts concatenated in order. Each part keeps its own row index,
        so index values repeat across parts.

    Raises
    ------
    ValueError
        If ``num_parts`` is less than 1.
    """
    if num_parts < 1:
        raise ValueError('num_parts must be >= 1, got {}'.format(num_parts))

    paths = [
        os.path.join(data_dir, 'fixed_transactions_part{}.csv'.format(part))
        for part in range(1, num_parts + 1)
    ]
    return cudf.concat([cudf.read_csv(path, dtype=types) for path in paths])


In [4]:
class AMLDataset(DGLDataset):
    """Single-graph DGL dataset built from the transaction CSV files.

    Each account is a node and each transaction is a directed edge from the
    sender account to the receiver account. ``process`` attaches:

    * ``ndata['feat']``  - standardized (in-degree, out-degree) per node
    * ``edata['feat']``  - standardized ``tx_amount`` per edge
    * ``edata['label']`` - the ``is_fraud`` column as float32
    * ``edata['train_mask' / 'val_mask' / 'test_mask']`` - a 60/20/20 split
      taken positionally over the edges (first 60% train, next 20% val,
      remainder test); the edges are not shuffled.
    """

    def __init__(self):
        super().__init__(name='aml')

    def process(self):
        """Build ``self.graph`` and populate its node/edge data."""
        df = load_transactions()

        # Account IDs are used directly as node IDs, so the node count must be
        # larger than the biggest ID on *either* endpoint. Counting distinct
        # senders under-counts whenever some account only receives or the IDs
        # are not a dense 0..N-1 range.
        largest_id = max(df.sender_account_id.max(), df.receiver_account_id.max())
        num_nodes = int(largest_id) + 1
        self.graph = dgl.graph((df.sender_account_id, df.receiver_account_id),
                               num_nodes=num_nodes, device=device)

        # Edge labels; created on the same device as the graph instead of
        # hard-coding CUDA.
        self.graph.edata['label'] = torch.as_tensor(
            df.is_fraud.values, dtype=torch.float32, device=device)

        # Node features: (in-degree, out-degree), one row per node.
        ins = self.graph.in_degrees()
        outs = self.graph.out_degrees()
        degrees = torch.stack([ins, outs], dim=1).type(torch.float32)

        # Standardize each degree column to zero mean / unit variance.
        deg_means = degrees.mean(axis=0)
        deg_stds = degrees.std(axis=0)
        self.graph.ndata['feat'] = (degrees - deg_means) / deg_stds

        # Edge feature: standardized transaction amount.
        mean_val = df['tx_amount'].mean()
        std = df['tx_amount'].std()
        scaled_amount = (df['tx_amount'] - mean_val) / std
        self.graph.edata['feat'] = torch.as_tensor(
            scaled_amount.values, dtype=torch.float32, device=device)

        # Positional 60/20/20 train/val/test split over the edges.
        number_edges = self.graph.num_edges()
        n_train = int(number_edges * 0.6)
        n_val = int(number_edges * 0.2)
        train_mask = torch.zeros(number_edges, dtype=torch.bool, device=device)
        val_mask = torch.zeros(number_edges, dtype=torch.bool, device=device)
        test_mask = torch.zeros(number_edges, dtype=torch.bool, device=device)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.edata['train_mask'] = train_mask
        self.graph.edata['val_mask'] = val_mask
        self.graph.edata['test_mask'] = test_mask

    def __getitem__(self, i):
        # The dataset holds exactly one graph; the index is ignored.
        return self.graph

    def __len__(self):
        return 1


In [5]:
import dgl.function as fn

class SAGEConv(nn.Module):
    """GraphSAGE-style convolution with edge-weighted mean aggregation.

    Every node averages the messages arriving on its incoming edges, where a
    message is the source node's feature multiplied by the edge's ``'feat'``
    value. The node's own feature and that average are concatenated and
    passed through one linear layer.

    Parameters
    ----------
    in_feat : int
        Input feature size.
    out_feat : int
        Output feature size.
    """
    def __init__(self, in_feat, out_feat):
        super().__init__()
        # The linear layer sees [own features || aggregated neighbor features],
        # hence twice the input width.
        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        """Apply the layer.

        Parameters
        ----------
        g : Graph
            The input graph; its edges must carry a ``'feat'`` entry.
        h : Tensor
            The input node feature.
        """
        with g.local_scope():
            g.ndata['h'] = h
            # Message: edge 'feat' times source 'h'; reduce: mean per destination.
            weigh_by_edge = fn.e_mul_u('feat', 'h', 'm')
            average_incoming = fn.mean('m', 'h_N')
            g.update_all(message_func=weigh_by_edge, reduce_func=average_incoming)
            neighborhood = g.ndata['h_N']
            return self.linear(torch.cat([h, neighborhood], dim=1))

In [6]:
class DotPredictor(nn.Module):
    """Score each edge by the dot product of its endpoint embeddings."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node embeddings ``h``."""
        with g.local_scope():
            g.ndata['h'] = h
            # Store <h[src], h[dst]> on every edge under the key 'score'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
            # u_dot_v yields a 1-element vector per edge; keep only that element.
            return edge_scores[:, 0]

In [7]:
class Model(nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Parameters
    ----------
    in_feats : int
        Width of the input node features.
    h_feats : int
        Width of the hidden representation.
    num_classes : int
        Width of the output node representation.
    """
    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats)
        self.conv2 = SAGEConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Compute output node representations for graph ``g``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [8]:
# Instantiate the dataset object that wraps the transaction graph.
a = AMLDataset()

In [9]:
# Display the dataset object's repr.
a

<__main__.AMLDataset at 0x2b931a5167c0>

In [10]:
# The dataset holds a single graph (len == 1); __getitem__ returns it for any index.
g = a[0]

In [11]:
# Display the graph summary (node/edge counts and feature schemes).
g

Graph(num_nodes=1000000, num_edges=124703184,
      ndata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)}
      edata_schemes={'label': Scheme(shape=(), dtype=torch.float32), 'feat': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)})

In [None]:
def train(g, model, num_epochs=10000, lr=0.01, log_every=50):
    """Train ``model`` full-batch to classify the edges of ``g``.

    Node embeddings produced by ``model`` are turned into one logit per edge
    by a ``DotPredictor``. The loss is binary cross-entropy on the edges
    selected by ``train_mask``; accuracy is tracked on the validation and
    test edges, and the test accuracy at the best validation accuracy is
    reported alongside it.

    Parameters
    ----------
    g : Graph
        Graph with ``ndata['feat']`` and ``edata['label']``,
        ``edata['train_mask']``, ``edata['val_mask']``, ``edata['test_mask']``.
    model : nn.Module
        Callable as ``model(g, features)`` returning node embeddings.
    num_epochs : int
        Number of full-batch optimization steps.
    lr : float
        Adam learning rate.
    log_every : int
        Print progress every ``log_every`` epochs; a falsy value disables
        printing.

    Returns
    -------
    None
        Progress is reported via ``print``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = 0.0
    best_test_acc = 0.0

    features = g.ndata['feat']
    edge_label = g.edata['label'].type(torch.float32)
    train_mask = g.edata['train_mask']
    val_mask = g.edata['val_mask']
    test_mask = g.edata['test_mask']

    # The predictor has no parameters, so it needs no device placement.
    score = DotPredictor()

    for epoch in range(num_epochs):
        node_emb = model(g, features)
        logits = score(g, node_emb)

        # Only the training edges contribute to the loss.
        loss = F.binary_cross_entropy_with_logits(logits[train_mask], edge_label[train_mask])

        # Metrics need no gradients; a logit above 0 means "predict fraud".
        with torch.no_grad():
            pred = (logits > 0.0).type(torch.int32)
            val_acc = (pred[val_mask] == edge_label[val_mask]).float().mean().item()
            test_acc = (pred[test_mask] == edge_label[test_mask]).float().mean().item()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and epoch % log_every == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                epoch, loss.item(), val_acc, best_val_acc, test_acc, best_test_acc))


    
    
# Build the two-layer model: the input width is taken from the node-feature
# matrix, with 16 hidden units and 2-dimensional output embeddings (these are
# what the dot-product edge scorer inside train() consumes).
model = Model(g.ndata['feat'].shape[1], 16, 2).to(device)
# Run training on the graph placed on the selected device.
train(g.to(device), model)

In epoch 0, loss: 0.702, val acc: 0.198 (best 0.198), test acc: 0.198 (best 0.198)
In epoch 50, loss: 0.625, val acc: 0.652 (best 0.652), test acc: 0.652 (best 0.652)
In epoch 100, loss: 0.598, val acc: 0.667 (best 0.667), test acc: 0.667 (best 0.667)
In epoch 150, loss: 0.589, val acc: 0.677 (best 0.677), test acc: 0.677 (best 0.677)
In epoch 200, loss: 0.580, val acc: 0.686 (best 0.686), test acc: 0.686 (best 0.686)
In epoch 250, loss: 0.574, val acc: 0.691 (best 0.691), test acc: 0.691 (best 0.691)
In epoch 300, loss: 0.569, val acc: 0.695 (best 0.695), test acc: 0.695 (best 0.696)
In epoch 350, loss: 0.563, val acc: 0.700 (best 0.701), test acc: 0.701 (best 0.701)
In epoch 400, loss: 0.559, val acc: 0.701 (best 0.701), test acc: 0.701 (best 0.701)
In epoch 450, loss: 0.557, val acc: 0.701 (best 0.701), test acc: 0.701 (best 0.701)
In epoch 500, loss: 0.555, val acc: 0.701 (best 0.701), test acc: 0.701 (best 0.701)
In epoch 550, loss: 0.554, val acc: 0.701 (best 0.702), test acc: 0.

In [13]:
# Record the PyTorch version this notebook was executed with.
torch.__version__

'1.7.1'