In [69]:
# pip install networkx==2.6

# Data sample #1
Data from the IBM AML (anti-money-laundering) transactions dataset on Kaggle.

In [2]:
import spartan as st
import numpy as np
import pandas as pd

Using backend cpu


In [2]:
# Read the raw transaction table from disk and preview its first four rows.
df_original = pd.read_csv(filepath_or_buffer="./inputData/LI-Small_Trans.csv")
df_original.head(n=4)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:08,11,8000ECA90,11,8000ECA90,3195403.0,US Dollar,3195403.0,US Dollar,Reinvestment,0
1,2022/09/01 00:21,3402,80021DAD0,3402,80021DAD0,1858.96,US Dollar,1858.96,US Dollar,Reinvestment,0
2,2022/09/01 00:00,11,8000ECA90,1120,8006AA910,592571.0,US Dollar,592571.0,US Dollar,Cheque,0
3,2022/09/01 00:16,3814,8006AD080,3814,8006AD080,12.32,US Dollar,12.32,US Dollar,Reinvestment,0


In [4]:
# Give the raw columns the short names used by the following cells.
# `rename` returns a new frame, so `df_original` keeps its original headers.
df = df_original.rename(
    columns={
        'From Bank': 'bankA',
        'Account': 'A',
        'To Bank': 'bankB',
        'Account.1': 'B',
        'Amount Received': 'sent1',
        'Receiving Currency': 'currency_sent',
        'Amount Paid': 'sent2',
        'Payment Currency': 'currency_recieved',
        'Is Laundering': 'ground_truth',
    }
)

In [7]:
# Relative frequency of each ground_truth label (1 = flagged as laundering).
# `normalize` is a boolean flag: pass True, not the string 'True', which only
# worked because any non-empty string is truthy.
df['ground_truth'].value_counts(normalize=True)

ground_truth
0    0.999485
1    0.000515
Name: proportion, dtype: float64

In [4]:
# df = df.sample(frac=0.1, replace=True, random_state=1)

In [10]:
# Keep only the rows where both currency columns are 'US Dollar', and
# narrow the frame to the account, amount and label columns.
filter_currency = df['currency_sent'].eq('US Dollar') & df['currency_recieved'].eq('US Dollar')
df = df.loc[filter_currency, ['A', 'B', 'sent1', 'sent2', 'ground_truth']]

In [5]:
# generating two sub-graphs
# Membership is checked against sets: `x in some_list` is a linear scan, which
# made the original comprehensions quadratic in the number of rows.
accounts_A = list(df.A.values)
accounts_B = list(df.B.values)

# Senders (column A) that never appear as a receiver (column B).
_receivers = set(accounts_B)
in_accounts = [account for account in accounts_A if account not in _receivers]

# The remaining senders, i.e. those that also appear as a receiver.
_source_only = set(in_accounts)
out_accounts = [account for account in accounts_A if account not in _source_only]

In [11]:
# Show the working frame; as the last expression of the cell it is rendered as output.
df

Unnamed: 0,A,B,sent1,sent2,ground_truth
0,8000ECA90,8000ECA90,3195403.00,3195403.00,0
1,80021DAD0,80021DAD0,1858.96,1858.96,0
2,8000ECA90,8006AA910,592571.00,592571.00,0
3,8006AD080,8006AD080,12.32,12.32,0
4,8006AD530,8006AD530,2941.56,2941.56,0
...,...,...,...,...,...
6920625,801B97B80,81BE03AA0,1408.98,1408.98,0
6920626,801B97B80,81BE03AA0,1057.71,1057.71,0
6920627,801B97B80,81BE03AA0,464.87,464.87,0
6920628,81256E7B0,81BEE05E0,161.40,161.40,0


In [12]:
# graph_1: rows whose sender (A) is listed in in_accounts, with the sent1 amount.
# graph_2: rows whose receiver (B) is listed in out_accounts, with the sent2 amount.
graph_1 = df.loc[df['A'].isin(in_accounts), ['A', 'B', 'sent1', 'ground_truth']]
graph_2 = df.loc[df['B'].isin(out_accounts), ['A', 'B', 'sent2', 'ground_truth']]

In [14]:
# Write both sub-graphs as tab-separated files, without the index column.
for _frame, _target in (
    (graph_1, './inputData/processed_graph_1.csv'),
    (graph_2, './inputData/processed_graph_2.csv'),
):
    _frame.to_csv(_target, sep='\t', index=False)

## Sampling

In [23]:
# Down-sample both sub-graphs because of limited computational resources.
sample_franc = 0.0005


def _draw_weighted_sample(path, amount_col, frac, seed=42):
    """Read a tab-separated sub-graph file and return a weighted random sample.

    Parameters
    ----------
    path : str
        Tab-separated file containing the columns A, B, `amount_col`
        and ground_truth.
    amount_col : str
        Name of the amount column to keep in the result.
    frac : float
        Fraction of rows to draw.
    seed : int, default 42
        Seed passed to `DataFrame.sample` as `random_state`.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index and the columns
        ['A', 'B', amount_col]; A and B are converted to int.
    """
    graph = pd.read_csv(path, sep='\t')

    # Every row is weighted by the number of rows sharing its ground_truth label.
    # NOTE(review): this makes rows of the larger class more likely to be drawn;
    # it is not a per-class (stratified) draw. Confirm this is what is wanted.
    graph['freq'] = graph.groupby('ground_truth')['ground_truth'].transform('count')
    sampled = graph.sample(frac=frac, weights=graph['freq'], random_state=seed)
    sampled = sampled.reset_index(drop=True)

    for col in ('A', 'B'):
        # Raw string: '\d' inside a normal literal is an invalid escape sequence.
        # NOTE(review): only the first run of digits is kept, so distinct IDs
        # such as 8006AA910 and 8006AD080 both become 8006. Confirm that
        # downstream consumers tolerate these collisions.
        sampled[col] = sampled[col].str.extract(r'(\d+)').astype(int)

    return sampled[['A', 'B', amount_col]]


sample_1 = _draw_weighted_sample("./inputData/processed_graph_1.csv", 'sent1', sample_franc)
sample_2 = _draw_weighted_sample("./inputData/processed_graph_2.csv", 'sent2', sample_franc)

In [24]:
# Show the (rows, columns) shape of each sample, one output per frame.
display(sample_1.shape, sample_2.shape)

(73, 3)

(1183, 3)

In [25]:
# Write both samples as comma-separated files with neither header row nor index.
_headerless_csv = dict(sep=',', index=False, header=False)
sample_1.to_csv('./inputData/sample_1.csv', **_headerless_csv)
sample_2.to_csv('./inputData/sample_2.csv', **_headerless_csv)

# Data sample #2

In [34]:
# Load the second dataset, then show its first four rows and its shape.
df_original = pd.read_csv(filepath_or_buffer="./inputData/ML.csv")
display(df_original.head(n=4), df_original.shape)

Unnamed: 0,typeofaction,sourceid,destinationid,amountofmoney,date,isfraud,typeoffraud
0,cash-in,30105,28942,494528,2019-07-19 14:40:00,1,type1
1,cash-in,30105,8692,494528,2019-05-17 14:57:00,1,type1
2,cash-in,30105,60094,494528,2019-07-20 13:20:00,1,type1
3,cash-in,30105,20575,494528,2019-07-03 14:15:00,1,type1


(2340, 7)

In [30]:
# Work on an independent (deep) copy so that df_original stays untouched.
df = df_original.copy(deep=True)

In [32]:
# Rename the source, destination, amount and label columns to A, B, sent1
# and ground_truth. `rename` returns a new frame, so no in-place mutation is needed.
df = df.rename(
    columns={
        'sourceid': 'A',
        'destinationid': 'B',
        'amountofmoney': 'sent1',
        'isfraud': 'ground_truth',
    }
)

# Relative frequency of each ground_truth label.
# `normalize` is a boolean flag: pass True, not the string 'True', which only
# worked because any non-empty string is truthy.
df['ground_truth'].value_counts(normalize=True)

ground_truth
1    0.597863
0    0.402137
Name: proportion, dtype: float64

In [33]:
# money in
# Membership is checked against sets: `x in some_list` is a linear scan, which
# made the original comprehensions quadratic in the number of rows.
accounts_A = list(df.A.values)
accounts_B = list(df.B.values)

# Senders (column A) that never appear as a receiver (column B).
_receivers = set(accounts_B)
in_accounts = [account for account in accounts_A if account not in _receivers]

# The remaining senders, i.e. those that also appear as a receiver.
_source_only = set(in_accounts)
out_accounts = [account for account in accounts_A if account not in _source_only]

In [39]:
# graph_1: rows whose sender (A) is listed in in_accounts.
# graph_2: rows whose receiver (B) is listed in out_accounts.
# Both keep the same three columns: A, B and sent1.
_edge_columns = ['A', 'B', 'sent1']
graph_1 = df.loc[df['A'].isin(in_accounts), _edge_columns]
graph_2 = df.loc[df['B'].isin(out_accounts), _edge_columns]

In [40]:
# Write both graphs as comma-separated files with neither header row nor index.
_headerless_csv = dict(sep=',', index=False, header=False)
graph_1.to_csv('./inputData/processed_ml_1.csv', **_headerless_csv)
graph_2.to_csv('./inputData/processed_ml_2.csv', **_headerless_csv)

In [8]:
# Reload the header-less graph_1 export; with header=None the columns are
# labelled 0, 1, 2. The comma separator is read_csv's default.
df = pd.read_csv('./inputData/processed_ml_1.csv', header=None)

In [15]:
# Turn column 0 into strings and append the letter 'A' to every value.
# Re-running this cell on the same frame appends a further 'A'.
_source_ids = df[0].astype('str')
df[0] = _source_ids + 'A'

In [17]:
# Write the frame as a comma-separated file with neither header row nor index.
df.to_csv(
    './inputData/processed_ml_1_v2.csv',
    sep=',',
    header=False,
    index=False,
)

In [16]:
# Show the working frame; as the last expression of the cell it is rendered as output.
df

Unnamed: 0,0,1,2
0,80740A,29758,388294
1,80740A,47869,388294
2,80740A,79227,388294
3,80740A,68225,388294
4,80740A,65031,388294
...,...,...,...
1502,14945A,43793,106907
1503,9532A,43793,106907
1504,27332A,43793,106907
1505,32685A,43793,106907
