In [1]:
import sys

sys.path.insert(0, "..")

In [2]:
import pandas as pd
import numpy as np
import torch
import logging
import itertools
import argparse
from data_util import GraphData, HeteroData, z_norm, create_hetero_obj

import json

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
with open('../data_config.json', 'r') as config_file:
    data_config = json.load(config_file)

# Access values of widgets
args = argparse.Namespace(
    data='Small_HI',
    ports=False,
    tds=False,
    model='gin',
    reverse_mp=False,
    num_neighs=[100,100],
    batch_size=8192
)

In [4]:
transaction_file = f"{data_config['paths']['aml_data']}/{args.data}/formatted_transactions.csv" #replace this with your path to the respective AML data objects

In [5]:
df_edges = pd.read_csv(transaction_file)

In [6]:
print(f'Available Edge Features: {df_edges.columns.tolist()}')
df_edges

Available Edge Features: ['EdgeID', 'from_id', 'to_id', 'Timestamp', 'Amount Sent', 'Sent Currency', 'Amount Received', 'Received Currency', 'Payment Format', 'Is Laundering']


Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
0,2,3,3,10,14675.57,0,14675.57,0,0,0
1,17,24,24,10,897.37,0,897.37,0,0,0
2,158,163,163,10,99986.94,0,99986.94,0,0,0
3,218,215,215,10,16.08,0,16.08,0,0,0
4,281,265,265,10,10.30,0,10.30,0,0,0
...,...,...,...,...,...,...,...,...,...,...
5078340,4962230,71717,20395,1504930,3749.14,0,3749.14,0,3,1
5078341,4962231,71717,71717,1509490,2091.95,0,1785.27,2,3,0
5078342,4962232,71717,131619,1509490,1785.27,2,1785.27,2,3,1
5078343,4962233,71717,273443,1515490,2154.54,0,2154.54,0,3,1


In [7]:
df_edges.describe()

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount Sent,Sent Currency,Amount Received,Received Currency,Payment Format,Is Laundering
count,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0
mean,2539172.0,185094.7,207749.0,371778.2,4509273.0,3.529483,5988726.0,3.557052,1.971657,0.001019427
std,1465992.0,127555.4,119976.7,269860.6,869772800.0,4.242403,1037183000.0,4.256171,1.419516,0.03191219
min,0.0,0.0,0.0,10.0,1e-06,0.0,1e-06,0.0,0.0,0.0
25%,1269586.0,77849.0,104462.0,102730.0,184.48,0.0,183.37,0.0,1.0,0.0
50%,2539172.0,185539.0,210709.0,389770.0,1414.54,2.0,1411.01,2.0,2.0,0.0
75%,3808758.0,281755.0,305695.0,616390.0,12297.84,6.0,12346.27,6.0,3.0,0.0
max,5078344.0,515087.0,515081.0,1527490.0,1046302000000.0,14.0,1046302000000.0,14.0,6.0,1.0


In [8]:
df_edges['Timestamp'] = df_edges['Timestamp'] - df_edges['Timestamp'].min()

In [9]:
# Find the maximum node ID by taking the maximum value from 'from_id' and 'to_id' columns and adding 1
max_n_id = df_edges.loc[:, ['from_id', 'to_id']].to_numpy().max() + 1

# Create a DataFrame for nodes with NodeID and Feature columns initialized with ones
df_nodes = pd.DataFrame({'NodeID': np.arange(max_n_id), 'Feature': np.ones(max_n_id)})

timestamps = torch.Tensor(df_edges['Timestamp'].to_numpy())
y = torch.LongTensor(df_edges['Is Laundering'].to_numpy())

print(f"Illicit ratio = {sum(y)} / {len(y)} = {sum(y) / len(y) * 100:.2f}%")
print(f"Number of nodes (holdings doing transcations) = {df_nodes.shape[0]}")
print(f"Number of transactions = {df_edges.shape[0]}")

Illicit ratio = 5177 / 5078345 = 0.10%
Number of nodes (holdings doing transcations) = 515088
Number of transactions = 5078345


In [1]:
import random
df_edges.loc[:, ['from_id']] = random.sample(range(0, 100), df_edges.shape[0]) #, 'to_id']] 



SyntaxError: invalid syntax (1151133521.py, line 3)

In [10]:
edge_features = ['Timestamp', 'Amount Received', 'Received Currency', 'Payment Format']
node_features = ['Feature']

print(f'Edge features being used: {edge_features}')
print(f'Node features being used: {node_features} ("Feature" is a placeholder feature of all 1s)')

Edge features being used: ['Timestamp', 'Amount Received', 'Received Currency', 'Payment Format']
Node features being used: ['Feature'] ("Feature" is a placeholder feature of all 1s)


In [11]:
x = torch.tensor(df_nodes.loc[:, node_features].to_numpy()).float()
edge_index = torch.LongTensor(df_edges.loc[:, ['from_id', 'to_id']].to_numpy().T) # (2 x num_edges)
edge_attr = torch.tensor(df_edges.loc[:, edge_features].to_numpy()).float() # (num_edges x num_edge_features)

In [12]:
n_days = int(timestamps.max() / (3600 * 24) + 1)
n_samples = y.shape[0]
print(f'number of days and transactions in the data: {n_days} days, {n_samples} transactions')

#data splitting
daily_irs, weighted_daily_irs, daily_inds, daily_trans = [], [], [], [] #irs = illicit ratios, inds = indices, trans = transactions
for day in range(n_days):
    l = day * 24 * 3600
    r = (day + 1) * 24 * 3600
    day_inds = torch.where((timestamps >= l) & (timestamps < r))[0]
    daily_irs.append(y[day_inds].float().mean())
    weighted_daily_irs.append(y[day_inds].float().mean() * day_inds.shape[0] / n_samples)
    daily_inds.append(day_inds)
    daily_trans.append(day_inds.shape[0])

split_per = [0.6, 0.2, 0.2]
daily_totals = np.array(daily_trans)
d_ts = daily_totals
I = list(range(len(d_ts)))
split_scores = dict()
for i,j in itertools.combinations(I, 2):
    if j >= i:
        split_totals = [d_ts[:i].sum(), d_ts[i:j].sum(), d_ts[j:].sum()]
        split_totals_sum = np.sum(split_totals)
        split_props = [v/split_totals_sum for v in split_totals]
        split_error = [abs(v-t)/t for v,t in zip(split_props, split_per)]
        score = max(split_error) #- (split_totals_sum/total) + 1
        split_scores[(i,j)] = score
    else:
        continue
i,j = min(split_scores, key=split_scores.get)
#split contains a list for each split (train, validation and test) and each list contains the days that are part of the respective split
split = [list(range(i)), list(range(i, j)), list(range(j, len(daily_totals)))]
print(f'Calculate split: {split}')

#Now, we seperate the transactions based on their indices in the timestamp array
split_inds = {k: [] for k in range(3)}
for i in range(3):
    for day in split[i]:
        split_inds[i].append(daily_inds[day]) #split_inds contains a list for each split (tr,val,te) which contains the indices of each day seperately


number of days and transactions in the data: 18 days, 5078345 transactions


Calculate split: [[0, 1, 2, 3, 4, 5], [6, 7], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]


In [13]:
tr_inds = torch.cat(split_inds[0])
val_inds = torch.cat(split_inds[1])
te_inds = torch.cat(split_inds[2])

print(f"Total train samples: {tr_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
        f"{y[tr_inds].float().mean() * 100 :.2f}% || Train days: {split[0][:5]}")
print(f"Total val samples: {val_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
    f"{y[val_inds].float().mean() * 100:.2f}% || Val days: {split[1][:5]}")
print(f"Total test samples: {te_inds.shape[0] / y.shape[0] * 100 :.2f}% || IR: "
    f"{y[te_inds].float().mean() * 100:.2f}% || Test days: {split[2][:5]}")

#Creating the final data objects
tr_x, val_x, te_x = x, x, x
e_tr = tr_inds.numpy()
e_val = np.concatenate([tr_inds, val_inds])

tr_edge_index,  tr_edge_attr,  tr_y,  tr_edge_times  = edge_index[:,e_tr],  edge_attr[e_tr],  y[e_tr],  timestamps[e_tr]
val_edge_index, val_edge_attr, val_y, val_edge_times = edge_index[:,e_val], edge_attr[e_val], y[e_val], timestamps[e_val]
te_edge_index,  te_edge_attr,  te_y,  te_edge_times  = edge_index,          edge_attr,        y,        timestamps


Total train samples: 63.98% || IR: 0.08% || Train days: [0, 1, 2, 3, 4]
Total val samples: 19.01% || IR: 0.11% || Val days: [6, 7]
Total test samples: 17.01% || IR: 0.19% || Test days: [8, 9, 10, 11, 12]


In [14]:
import copy

def fake_trans_1_old_1_new(te_edge_index):
    edge_index = copy.deepcopy(te_edge_index)

    max_id = edge_index.max() + 10000

    random_vals = torch.randint(0, edge_index.max(), size=[edge_index.shape[1]])
    mask = np.random.random(size=edge_index.shape[1]) > 0.5
    edge_index[0, :] += max_id
    edge_index[1, :] += max_id
    # edge_index[1, mask] = random_vals[mask]

    # edge_index[1, ~mask] += max_id
    # edge_index[0, ~mask] = random_vals[~mask]

    return edge_index

In [15]:
new_te_edge_index = fake_trans_1_old_1_new(te_edge_index)

In [16]:
te_edge_index.shape

torch.Size([2, 5078345])

In [17]:
print(edge_index[:,e_tr].shape)
print(edge_index[:,e_val].shape)
print(edge_index.shape)

torch.Size([2, 3248921])
torch.Size([2, 4214445])
torch.Size([2, 5078345])


In [18]:
tr_data = GraphData (x=tr_x,  y=tr_y,  edge_index=tr_edge_index,  edge_attr=tr_edge_attr,  timestamps=tr_edge_times )
val_data = GraphData(x=val_x, y=val_y, edge_index=val_edge_index, edge_attr=val_edge_attr, timestamps=val_edge_times)
te_data = GraphData (x=te_x,  y=te_y,  edge_index=new_te_edge_index,  edge_attr=te_edge_attr,  timestamps=te_edge_times )


In [19]:
tr_data.edge_attr[:, :-1], val_data.edge_attr[:, :-1], te_data.edge_attr[:, :-1] = z_norm(tr_data.edge_attr[:, :-1]), z_norm(val_data.edge_attr[:, :-1]), z_norm(te_data.edge_attr[:, :-1])

In [20]:
from train_util import AddEgoIds, extract_param, add_arange_ids, get_loaders, evaluate_homo, evaluate_hetero

transform = None

#add the unique ids to later find the seed edges
add_arange_ids([tr_data, val_data, te_data])

# tr_loader, val_loader, te_loader = get_loaders(tr_data, val_data, te_data, tr_inds, val_inds, te_inds, transform, args)