## Preprocessing and cleaning 

In this notebook I explore the dataset and pre process it in order to generate a new dataset. In short the procedure contains
1) The creation of new columns  'Year', 'Month', 'Day', 'Hour', 'Minute', 'Currency Code',	'Rate',	'Amount Paid USD',	'Amount Received USD'
2) Standardise amount in different currencies to USD
 

In [1]:
import pandas as pd 
import numpy as np

In [2]:
from data import *
df = get_data_local()
df = clean_data(df)

✅ data received
✅ data cleaned


In [3]:
# Get number of rows
num_rows = len(df)
print("Number of rows: ", num_rows)

Number of rows:  4425434


In [4]:
df

Unnamed: 0,from_bank,from_account,to_bank,to_account,receiving_currency,payment_currency,payment_format,is_laundering,amount_paid_USD,amount_received_USD,month,day,hour,minute
0,3208,8000F4580,1,8000F5340,USD,USD,Cheque,0,0.010000,0.010000,9,1,0,20
1,12,8000EC280,2439,8017BF800,USD,USD,Credit Card,0,7.660000,7.660000,9,1,0,26
2,1,8000EDEC0,211050,80AEF5310,USD,USD,Credit Card,0,383.709991,383.709991,9,1,0,21
3,1,8000F4510,11813,8011305D0,USD,USD,Credit Card,0,9.820000,9.820000,9,1,0,4
4,1,8000F4FE0,245335,812ED62E0,USD,USD,Credit Card,0,4.010000,4.010000,9,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425429,54219,8148A6631,256398,8148A8711,XBT,XBT,Bitcoin,0,0.154978,0.154978,9,10,23,57
4425430,15,8148A8671,256398,8148A8711,XBT,XBT,Bitcoin,0,0.108128,0.108128,9,10,23,35
4425431,154365,8148A6771,256398,8148A8711,XBT,XBT,Bitcoin,0,0.004988,0.004988,9,10,23,52
4425432,256398,8148A6311,256398,8148A8711,XBT,XBT,Bitcoin,0,0.038417,0.038417,9,10,23,46


In [56]:
# Column names and data type
df.dtypes

timestamp              datetime64[ns]
from_bank                       int64
from_account                   object
to_bank                         int64
to_account                     object
payment_format                 object
is_laundering                   int64
amount_paid_USD               float64
amount_received_USD           float64
currency_pair                  object
year                            int32
month                           int32
day                             int32
hour                            int32
minute                          int32
dtype: object

In [57]:
#How many unique accounts there are
unique_count = df['from_account'].nunique()
print('Unique count of Account: ', unique_count)


Unique count of Account:  309706


In [58]:
# Get the distribution of Is Laundering 
laundering_counts = df['is_laundering'].value_counts(normalize=True)
print(laundering_counts)

is_laundering
0    0.99883
1    0.00117
Name: proportion, dtype: float64


In [59]:
df.columns

Index(['timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account',
       'payment_format', 'is_laundering', 'amount_paid_USD',
       'amount_received_USD', 'currency_pair', 'year', 'month', 'day', 'hour',
       'minute'],
      dtype='object')

In [60]:
# Identify Unique Currencies
unique_currencies = df['currency_pair'].unique()
print(unique_currencies)

['USD_USD' 'XBT_XBT' 'EUR_USD' 'EUR_EUR' nan 'USD_EUR' 'USD_CNY' 'CNY_USD'
 'CNY_CNY' 'INR_USD' 'INR_INR' 'USD_JPY' 'USD_INR' 'MXN_USD' 'MXN_MXN'
 'GBP_USD' 'GBP_GBP' 'USD_CAD' 'USD_GBP' 'USD_MXN' 'CAD_USD' 'CAD_CAD'
 'CHF_USD' 'CHF_CHF' 'JPY_USD' 'JPY_JPY' 'USD_CHF' 'XBT_USD' 'USD_RUB'
 'JPY_EUR' 'BRL_EUR' 'BRL_BRL' 'GBP_EUR' 'MXN_EUR' 'CNY_EUR' 'EUR_JPY'
 'EUR_CNY' 'EUR_INR' 'EUR_CAD' 'ILS_EUR' 'ILS_ILS' 'EUR_BRL' 'EUR_GBP'
 'RUB_EUR' 'RUB_RUB' 'EUR_CHF' 'CNY_INR' 'CNY_JPY' 'MXN_CNY' 'CNY_SAR'
 'CNY_ILS' 'CAD_JPY' 'JPY_CNY' 'JPY_GBP' 'JPY_XBT' 'INR_CNY' 'INR_EUR'
 'INR_JPY' 'ILS_INR' 'RUB_USD' 'RUB_INR' 'EUR_RUB' 'CNY_GBP' 'GBP_JPY'
 'GBP_CNY' 'GBP_INR' 'CAD_CNY' 'CAD_RUB' 'CAD_EUR' 'GBP_CAD' 'RUB_CAD'
 'MXN_CAD' 'CNY_MXN' 'MXN_JPY' 'EUR_MXN' 'MXN_RUB' 'MXN_GBP' 'BRL_USD'
 'JPY_BRL' 'BRL_CNY' 'USD_BRL' 'CHF_INR' 'CHF_EUR' 'CHF_CAD' 'CHF_GBP'
 'CAD_CHF' 'CHF_CNY' 'GBP_CHF' 'ILS_USD' 'EUR_ILS' 'ILS_CNY' 'ILS_JPY'
 'CAD_ILS' 'ILS_BRL' 'ILS_GBP' 'ILS_CAD' 'SAR_SAR' 'SAR_USD' 'SAR_RUB'
 '

In [38]:
# Get the distribution the percentual distribution of flagged transactions by currency 

# Group by payment_currency and Is_Laundering, count the number of rows
distribution = df.groupby(['currency_pair', 'is_laundering']).size().reset_index(name='Count')

# Calculate the total counts per currency and name it
total_counts = df.groupby('currency_pair').size()
total_counts.name = 'Count_total'

# Join the total_counts to the distribution dataframe
distribution = distribution.set_index('currency_pair').join(total_counts)

# Calculate the percentage
distribution['Percent'] = distribution['Count'] / distribution['Count_total'] * 100

# Reset the index
distribution.reset_index(inplace=True)
print(distribution)

    currency_pair  is_laundering   Count  Count_total     Percent
0         BRL_BRL              0   61219        61276   99.906978
1         BRL_BRL              1      57        61276    0.093022
2         BRL_CAD              0      23           23  100.000000
3         BRL_CHF              0       2            2  100.000000
4         BRL_CNY              0      57           57  100.000000
..            ...            ...     ...          ...         ...
196       XBT_RUB              0       1            1  100.000000
197       XBT_SAR              0       4            4  100.000000
198       XBT_USD              0    1099         1099  100.000000
199       XBT_XBT              0  145957       146013   99.961647
200       XBT_XBT              1      56       146013    0.038353

[201 rows x 5 columns]


In [39]:
# Convert Timestamp into datetime
#df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Create new columns for year, month, day, hour and minute
#df['Year'] = df['Timestamp'].dt.year
#df['Month'] = df['Timestamp'].dt.month
#df['Day'] = df['Timestamp'].dt.day
#df['Hour'] = df['Timestamp'].dt.hour
#df['Minute'] = df['Timestamp'].dt.minute


In [40]:
# Let's inspect the dataframe
df.head()

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,payment_format,is_laundering,amount_paid_USD,amount_received_USD,currency_pair,year,month,day,hour,minute
0,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,Cheque,0,0.01,0.01,USD_USD,2022,9,1,0,20
1,2022-09-01 00:26:00,12,8000EC280,2439,8017BF800,Credit Card,0,7.66,7.66,USD_USD,2022,9,1,0,26
2,2022-09-01 00:21:00,1,8000EDEC0,211050,80AEF5310,Credit Card,0,383.709991,383.709991,USD_USD,2022,9,1,0,21
3,2022-09-01 00:04:00,1,8000F4510,11813,8011305D0,Credit Card,0,9.82,9.82,USD_USD,2022,9,1,0,4
4,2022-09-01 00:08:00,1,8000F4FE0,245335,812ED62E0,Credit Card,0,4.01,4.01,USD_USD,2022,9,1,0,8


In [262]:
# Now let's standardise the Amounts to US Dollar 

# First let' create a dictionary to map full currency names to their  ISO codes
#currency_codes = {
#    'Australian Dollar': 'AUD',
#    'Bitcoin': 'BTC',
#    'Brazil Real': 'BRL',
#    'Canadian Dollar': 'CAD',
#    'Euro': 'EUR',
#    'Mexican Peso': 'MXN',
#    'Ruble': 'RUB',
#    'Rupee': 'INR',
#    'Saudi Riyal': 'SAR',
#    'Shekel': 'ILS',
#    'Swiss Franc': 'CHF',
#    'UK Pound': 'GBP',
#    'Yen': 'JPY',
#    'Yuan': 'CNY',
#    'US Dollar': 'USD'
#}

# Replace the full currency names with their codes 
#df['Currency Code'] = df['Payment Currency'].replace(currency_codes)


In [20]:
#import requests
# Get the exchange rate for all the currencies by connecting to Exchange Rate Data API

# Exchange rate data api key
#api_key = "wdiQOxfDckJMZr70O1Brmvlh56iJEfE7"

# Arbitrary date to fetch the exchange rates
#date = "2022-09-30"

# The list of currency codes to fetch the exchange rates
#currency_codes = ["GBP", "EUR", "AUD", "BTC", "BRL", "CAD", "MXN", "RUB", "INR", "SAR", "ILS", "CHF", "JPY", "CNY"]

# URL
#url = f"https://api.apilayer.com/exchangerates_data/{date}?symbols={','.join(currency_codes)}&base=USD"

# Define the headers
#headers = {
#  "apikey": api_key
#}

# Send a GET request to the API
#response = requests.get(url, headers=headers)

# Convert the response to JSON
#data = response.json()

# Create a DataFrame from the rates

#df_rates = pd.DataFrame(data['rates'].items(), columns=['Currency Code', 'Rate'])

# Display the DataFrame
#print(df_rates)


In [264]:
# Merge df with df_rates on 'Currency Code', preserving all rows from df and filling in NaN for missing match
#df = df.merge(df_rates, on='Currency Code', how='left')

# Wherever the rate is NaN, that means the currency was USD. We can fill those with 1.
#df['Rate'] = df['Rate'].fillna(1)


In [265]:
# Compute 'Amount Paid USD' and 'Amount Received USD'
#df['Amount Paid USD'] = df['Amount Paid'] * df['Rate']
#df['Amount Received USD'] = df['Amount Received'] * df['Rate']

In [41]:
df

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,payment_format,is_laundering,amount_paid_USD,amount_received_USD,currency_pair,year,month,day,hour,minute
0,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,Cheque,0,0.010000,0.010000,USD_USD,2022,9,1,0,20
1,2022-09-01 00:26:00,12,8000EC280,2439,8017BF800,Credit Card,0,7.660000,7.660000,USD_USD,2022,9,1,0,26
2,2022-09-01 00:21:00,1,8000EDEC0,211050,80AEF5310,Credit Card,0,383.709991,383.709991,USD_USD,2022,9,1,0,21
3,2022-09-01 00:04:00,1,8000F4510,11813,8011305D0,Credit Card,0,9.820000,9.820000,USD_USD,2022,9,1,0,4
4,2022-09-01 00:08:00,1,8000F4FE0,245335,812ED62E0,Credit Card,0,4.010000,4.010000,USD_USD,2022,9,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425429,2022-09-10 23:57:00,54219,8148A6631,256398,8148A8711,Bitcoin,0,0.154978,0.154978,XBT_XBT,2022,9,10,23,57
4425430,2022-09-10 23:35:00,15,8148A8671,256398,8148A8711,Bitcoin,0,0.108128,0.108128,XBT_XBT,2022,9,10,23,35
4425431,2022-09-10 23:52:00,154365,8148A6771,256398,8148A8711,Bitcoin,0,0.004988,0.004988,XBT_XBT,2022,9,10,23,52
4425432,2022-09-10 23:46:00,256398,8148A6311,256398,8148A8711,Bitcoin,0,0.038417,0.038417,XBT_XBT,2022,9,10,23,46


In [42]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse = False)
ohe.fit(df[['payment_format']])
df[ohe.get_feature_names_out()] = ohe.transform(df[['payment_format']])

#ohe.fit(df[['currency_pair']])
#df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])




In [52]:
df['currency_pair'].nunique()

187

In [43]:
ohe.fit(df[['currency_pair']])
df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])


  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['currency_pair']])
  df[ohe.get_feature_names_out()] = ohe.transform(d

KeyboardInterrupt: 

In [269]:
df_temp = df.head(500000)

In [270]:
df_temp

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,Currency Code_CNY,Currency Code_EUR,Currency Code_GBP,Currency Code_ILS,Currency Code_INR,Currency Code_JPY,Currency Code_MXN,Currency Code_RUB,Currency Code_SAR,Currency Code_USD
0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2022-09-01 00:00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2022-09-01 00:02:00,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2022-09-01 00:06:00,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,2022-09-01 05:57:00,23402,810DF3390,23402,810DF3390,16.32,US Dollar,16.32,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
499996,2022-09-01 05:46:00,28694,810DF7B90,28694,810DF7B90,21.96,US Dollar,21.96,US Dollar,Reinvestment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
499997,2022-09-01 05:41:00,220271,8087455E0,132722,810DFA0A0,5867.31,US Dollar,5867.31,US Dollar,Cheque,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
499998,2022-09-01 05:41:00,220271,8087455E0,132722,810DFA0A0,6697.05,US Dollar,6697.05,US Dollar,Credit Card,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [271]:
! pip install networkx
import networkx as nx
G = nx.MultiGraph()

# Add nodes to the graph for each unique card_id, merchant_name
G.add_nodes_from(df_temp["Account"].unique(), type='Account')
G.add_nodes_from(df_temp["Account.1"].unique(), type='Account.1')



In [272]:
len(df_temp["Account"].unique())

298058

In [273]:
df_temp = df.head(10000)

In [274]:
for _, row in df_temp.iterrows():
    # Create a variable for each properties for each edge
    
        year = row["Year"],
        month = row["Month"],
        day = row["Day"],
        hour = row["Hour"],
        minute =row["Minute"],
        amount_paid = row["Amount Paid USD"],
        payment_format_ach =  row["Payment Format_ACH"],
        payment_format_bitcoin = row["Payment Format_Bitcoin"],
        payment_format_cash = row["Payment Format_Cash"],
        Payment_format_Cheque = row["Payment Format_Cheque"],
        Payment_Format_Credit_Card = row["Payment Format_Credit Card"],
        Payment_Format_Reinvestment = row["Payment Format_Reinvestment"],
        Payment_Format_Wire =row["Payment Format_Wire"],
        Currency_Code_BRL =  row["Currency Code_BRL"],
        Currency_Code_BTC = row["Currency Code_BTC"],
        Currency_Code_CHF = row["Currency Code_CHF"],
        Currency_Code_EUR = row["Currency Code_EUR"],
        Currency_Code_GBP = row["Currency Code_GBP"],
        Currency_Code_ILS =row["Currency Code_ILS"],
        Currency_Code_INR = row["Currency Code_INR"],
        Currency_Code_JPY =  row["Currency Code_JPY"],
        Currency_Code_MXN = row["Currency Code_MXN"],
        Currency_Code_RUB =row["Currency Code_RUB"],
        Currency_Code_SAR = row["Currency Code_SAR"],
        Currency_Code_USD =  row["Currency Code_USD"]
 
        G.add_edge(row['Account'], row['Account.1'], year = year , month = month , day = day ,
              hour = hour , minute = minute , amount_paid = amount_paid, payment_format_ach =  payment_format_ach,
              payment_format_bitcoin = payment_format_bitcoin, 
            payment_format_cash = payment_format_cash,
            Payment_format_Cheque = Payment_format_Cheque,
            Payment_Format_Credit_Card = Payment_Format_Credit_Card,
            Payment_Format_Reinvestment = Payment_Format_Reinvestment,
            Payment_Format_Wire = Payment_Format_Wire,
            Currency_Code_BRL =  Currency_Code_BRL,
            Currency_Code_BTC = Currency_Code_BTC,
            Currency_Code_CHF = Currency_Code_CHF,
            Currency_Code_EUR = Currency_Code_EUR,
            Currency_Code_GBP = Currency_Code_GBP,
            Currency_Code_ILS = Currency_Code_ILS,
            Currency_Code_INR = Currency_Code_INR,
            Currency_Code_JPY =  Currency_Code_JPY,
            Currency_Code_MXN = Currency_Code_MXN,
            Currency_Code_RUB = Currency_Code_RUB,
            Currency_Code_SAR = Currency_Code_SAR,
            Currency_Code_USD =  Currency_Code_USD)
  

In [275]:
# Get the number of nodes and edges in the graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Print the number of nodes and edges
print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

Number of nodes: 326815
Number of edges: 10000


In [276]:
# Convert the graph to an adjacency matrix
adj_matrix = nx.adjacency_matrix(G).todense()

In [277]:
adj_matrix.shape

(326815, 326815)

In [278]:
# Get a small sample of the nodes in the graph
sample_nodes = list(G.nodes())[:10]

# Retrieve the properties of the sample nodes
node_properties = nx.get_node_attributes(G, 'type')

# Print the properties of the sample nodes
for node in sample_nodes:
    print(f"Node: {node}, Properties: {node_properties[node]}")

Node: 8000EBD30, Properties: Account.1
Node: 8000F4580, Properties: Account
Node: 8000F4670, Properties: Account.1
Node: 8000F5030, Properties: Account.1
Node: 8000F5200, Properties: Account.1
Node: 8000F5AD0, Properties: Account.1
Node: 8000EBAC0, Properties: Account.1
Node: 8000EC1E0, Properties: Account.1
Node: 8000EC280, Properties: Account
Node: 8000EDEC0, Properties: Account


In [279]:
sample_size = 5
for i, edge in enumerate(G.edges()):
    print(G.get_edge_data(*edge))
    if i >= sample_size - 1:
        break

{0: {'year': (2022,), 'month': (9,), 'day': (1,), 'hour': (0,), 'minute': (20,), 'amount_paid': (3697.34,), 'payment_format_ach': (0.0,), 'payment_format_bitcoin': (0.0,), 'payment_format_cash': (0.0,), 'Payment_format_Cheque': (0.0,), 'Payment_Format_Credit_Card': (0.0,), 'Payment_Format_Reinvestment': (1.0,), 'Payment_Format_Wire': (0.0,), 'Currency_Code_BRL': (0.0,), 'Currency_Code_BTC': (0.0,), 'Currency_Code_CHF': (0.0,), 'Currency_Code_EUR': (0.0,), 'Currency_Code_GBP': (0.0,), 'Currency_Code_ILS': (0.0,), 'Currency_Code_INR': (0.0,), 'Currency_Code_JPY': (0.0,), 'Currency_Code_MXN': (0.0,), 'Currency_Code_RUB': (0.0,), 'Currency_Code_SAR': (0.0,), 'Currency_Code_USD': 1.0}}
{0: {'year': (2022,), 'month': (9,), 'day': (1,), 'hour': (0,), 'minute': (20,), 'amount_paid': (0.01,), 'payment_format_ach': (0.0,), 'payment_format_bitcoin': (0.0,), 'payment_format_cash': (0.0,), 'Payment_format_Cheque': (1.0,), 'Payment_Format_Credit_Card': (0.0,), 'Payment_Format_Reinvestment': (0.0,), 

In [280]:
import collections
# Retrieve the properties errors of all the edges
edge_properties = nx.get_edge_attributes(G, 'errors')

# Count the number of edges by property value
edge_count_by_property = collections.Counter(edge_properties.values())

# Print the count of edges by property value
for property_value, count in edge_count_by_property.items():
    print(f"Property value: {property_value}, Count: {count}")

In [281]:
# Prepare the data for input into the model
edge_list = list(G.edges(data=True))


In [282]:
list(edge_list[i][2].values())

[(2022,),
 (9,),
 (1,),
 (0,),
 (6,),
 (36682.97,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (1.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 (0.0,),
 1.0]

In [283]:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FraudGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FraudGNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x.squeeze(-1)

# Prepare the data for input into the model
edge_list = list(G.edges(data=True))
x = []
for edge in edge_list:
    edge_values = list(edge[2].values())
    print(edge_values)
    edge_values = [float(i[0]) if type(i) == tuple and type(i[0]) == str else i[0] if type(i) == tuple else i for i in edge_values]
    x.append(edge_values)
x = torch.tensor(np.array(x), dtype=torch.float)

[(2022,), (9,), (1,), (0,), (20,), (3697.34,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (1.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), 1.0]
[(2022,), (9,), (1,), (0,), (20,), (0.01,), (0.0,), (0.0,), (0.0,), (1.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), 1.0]
[(2022,), (9,), (1,), (0,), (0,), (14675.57,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (1.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), 1.0]
[(2022,), (9,), (1,), (0,), (2,), (2806.97,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (1.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), 1.0]
[(2022,), (9,), (1,), (0,), (6,), (36682.97,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (1.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), (0.0,), 1.0]
[(2022,), (9,), (1,), (0

In [284]:
test = pd.DataFrame(x)
test.shape

(10000, 25)

In [285]:
target = torch.tensor(df_temp['Is Laundering'].values, dtype=torch.float)

In [286]:
x

tensor([[2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        [2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        [2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        ...,
        [2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        [2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00],
        [2.0220e+03, 9.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00]])

In [287]:
# Define the model
input_dim = len(x[0])
hidden_dim = 16
model = FraudGNN(input_dim, hidden_dim)
num_epochs=201

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [288]:
for i in range(num_epochs):
    # Forward pass
    output = model(x)
    # Compute the loss
    loss = criterion(output, target)
    if i % 20 == 0:
        print(f'Epoch: {i}, Loss: {loss.item()}')
    # Zero the gradients
    optimizer.zero_grad()
    # Perform backpropagation
    loss.backward()
    # Update the parameters
    optimizer.step()
    

Epoch: 0, Loss: 941.5491943359375
Epoch: 20, Loss: 0.1451844722032547
Epoch: 40, Loss: 0.15871749818325043
Epoch: 60, Loss: 0.15956349670886993
Epoch: 80, Loss: 0.15825606882572174
Epoch: 100, Loss: 0.15646234154701233
Epoch: 120, Loss: 0.15444718301296234
Epoch: 140, Loss: 0.15226297080516815
Epoch: 160, Loss: 0.14992870390415192
Epoch: 180, Loss: 0.1474568247795105
Epoch: 200, Loss: 0.1448569893836975
